[pornotube] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.pornotube import PornotubeIE
45 from .extractor.rbmaradio import RBMARadioIE
46 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
47 from .extractor.stanfordoc import StanfordOpenClassroomIE
48 from .extractor.steam import SteamIE
49 from .extractor.ted import TEDIE
50 from .extractor.ustream import UstreamIE
51 from .extractor.vimeo import VimeoIE
52 from .extractor.worldstarhiphop import WorldStarHipHopIE
53 from .extractor.xnxx import XNXXIE
54 from .extractor.xvideos import XVideosIE
55 from .extractor.yahoo import YahooIE, YahooSearchIE
56 from .extractor.youku import YoukuIE
57 from .extractor.youporn import YouPornIE
58 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
59 from .extractor.zdf import ZDFIE
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class YouJizzIE(InfoExtractor):
    """Extractor for single video pages on youjizz.com.

    The video page only links to an embed page; the media URL is taken from
    the ``so.addVariable("file", ...)`` call found on that embed page.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError when the URL does not match ``_VALID_URL`` or
        when no embed-page link is present in the video page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Identifier taken from the URL; it only labels the first download.
        page_slug = url_match.group('videoid')
        main_page = self._download_webpage(url, page_slug)

        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', main_page, u'title')
        title = title.strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            main_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        # The numeric id of the embed link replaces the URL identifier.
        embed_url = embed_match.group(0).strip()
        numeric_id = embed_match.group('videoid')
        embed_page = self._download_webpage(embed_url, numeric_id)

        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': numeric_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
124
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes; returns one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict for every track of the mix.

        The mix description is parsed from the ``PAGE.mix`` assignment in the
        page. Tracks are then requested one at a time from the ``sets``
        endpoints until a response reports ``at_last_track``.

        Raises ExtractorError when the URL does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)
        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random number is used as the session component of the API URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']

        tracks = []
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        position = 0
        while True:
            position += 1
            raw_reply = self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), total),
                errnote=u'Failed to download song information')
            reply = json.loads(raw_reply)
            track = reply[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if reply['set']['at_last_track']:
                return tracks
            # Request the track that follows the one just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
165
class KeekIE(InfoExtractor):
    """Extractor for keek.com video pages."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The media and thumbnail URLs are built directly from the id found in
        the URL; the page is downloaded only for the title and the uploader.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        # The uploader is looked up with fatal=False.
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
193
194
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de videos, driven by the site's metadata XML."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The video id is read from the URL path and used to request a metadata
        XML document, from which every other field is taken.

        Raises ExtractorError when the metadata lacks the ``url_flv`` or
        ``title`` element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # mandatory values: download URL and title
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        # extension without the leading dot
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional values
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # No explicit format id: fall back to the file extension.
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
248
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The title comes from the HTML page; file name and duration come from
        a per-video XML document on video2.spiegel.de.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child of the XML root is used.
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
280
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com ``view?i=...`` pages."""

    # The scheme is optional; both http:// and https:// are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Raises ExtractorError when the URL does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Remove the site prefix from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are looked up with fatal=False.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
317
318
319
class TumblrIE(InfoExtractor):
    """Extractor for videos embedded in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The post page is always fetched through its ``/post/<id>/`` address,
        whichever form the incoming URL had.

        Raises ExtractorError when no video source is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        blog = url_match.group('blog_name')
        video_id = url_match.group('id')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # The pattern looks for the literal text \x22 around the values.
        source_pattern = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source_match = re.search(source_pattern, page)
        if source_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        poster = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if poster:
            poster = poster.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': source_match.group('video_url'),
            'title': title,
            'thumbnail': poster,
            'ext': source_match.group('ext'),
        }]
353
class BandcampIE(InfoExtractor):
    """Extractor for freely downloadable bandcamp.com tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Steps: track page -> free download page -> ``statdownload`` request,
        whose ``retry_url`` value is used as the final media URL.

        Raises ExtractorError when the track has no free download page or
        when any of the expected fragments cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
399
class RedTubeIE(InfoExtractor):
    """Extractor for redtube.com video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Raises ExtractorError when the URL does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is the src of the mp4 <source> element.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
427         
class InaIE(InfoExtractor):
    """Extractor for ina.fr videos, based on the player's MRSS document."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Only the MRSS document on player.ina.fr is downloaded; the HTML page
        at ``url`` itself is not fetched.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        mrss = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        # The title is wrapped in a CDATA section inside <title>.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
454
class HowcastIE(InfoExtractor):
    """Extractor for howcast.com video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is re-requested through its ``/videos/<id>`` address built
        from the id found in ``url``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        # Description and thumbnail are looked up with fatal=False.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
488
class VineIE(InfoExtractor):
    """Extractor for vine.co video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is always requested over https using the id from ``url``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # Thumbnail and uploader are looked up with fatal=False.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
522
class FlickrIE(InfoExtractor):
    """Extractor for videos hosted on flickr.com photo pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Three documents are fetched: the photo page (for ``photo_secret`` and
        the og: metadata), a first XML document that yields a node id, and a
        playlist XML document that yields the stream location.

        Raises ExtractorError when the playlist has no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # APP is used as-is; FULLPATH is HTML-unescaped before being appended.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]
571
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The numeric video id is read from the page and then used to request
        a ``cvp/2.0/<id>.xml`` document that contains the media URL.

        Raises ExtractorError when the URL does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The URL title only labels the first download.
        url_title = url_match.group('url_title')
        page = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        # Thumbnail and description are looked up with fatal=False.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        # Only the <file type="high"> entry is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data_xml, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
610
class XHamsterIE(InfoExtractor):
    """Extractor for xhamster.com movie pages."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Raises ExtractorError when the ``'srv'``/``'file'`` pair is not found
        in the page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is requested through an address built from the id alone.
        page = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(
            r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',',
            page)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = media.group('server')
        if server:
            media_url = server + '/key=' + media.group('file')
        else:
            # With an empty server the file value is itself a quoted URL.
            media_url = compat_urllib_parse.unquote(media.group('file'))
        extension = media_url.split('.')[-1]

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', page, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        date_match = re.search(
            r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'',
            page)
        if date_match is None:
            upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            # Year, month and day are concatenated (YYYYMMDD).
            upload_date = (date_match.group('upload_date_Y')
                           + date_match.group('upload_date_m')
                           + date_match.group('upload_date_d'))

        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            page, u'uploader id', default=u'anonymous')
        thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
        }]
662
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The track page is requested with ``ax`` and ``ts`` query parameters;
        its ``Set-Cookie`` header is sent back with the follow-up request to
        the ``serve/source`` endpoint, which returns the media URL as JSON.

        Raises ExtractorError when the URL does not match ``_VALID_URL`` or
        when either response is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the site replaces the one from the URL.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The request body must be bytes: Python 3's urllib raises TypeError
        # for a str body. b"" is the same object type as "" on Python 2, so
        # the request stays an empty-body POST on both versions.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
712
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The first response carries a ``window.location`` assignment that is
        followed to reach the real page (used for the title). The media and
        thumbnail URLs come from a form-encoded POST to ``magare.do``.

        Raises ExtractorError when the URL does not match ``_VALID_URL`` or
        when the info request yields no response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The new location is appended to the URL that was actually fetched.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Keep only the part of the <title> that precedes the first '/'.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode() returns text; the request body must be bytes on
        # Python 3. The encoded form is pure ASCII, so this is a no-op on
        # Python 2.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is unpacked as exactly two '&'-separated key=value
        # pairs. Split on the first '=' only, so a value that itself contains
        # '=' is not truncated.
        (final_url, thumbnail_url) = map(lambda x: x.split('=', 1)[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
748
749
def gen_extractors():
    """Return a list with one fresh instance of every supported extractor.

    The order matters: the first extractor that matches a URL is the one
    that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
819
def get_info_extractor(ie_name):
    """Look up an extractor class by its name without the ``IE`` suffix.

    The name is resolved among this module's globals, so
    ``get_info_extractor('Foo')`` returns the object bound to ``FooIE``.
    Raises KeyError when no such global exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]