d2b80c5580338401b7d6540bfd0fde191dd7044a
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.rbmaradio import RBMARadioIE
45 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
46 from .extractor.stanfordoc import StanfordOpenClassroomIE
47 from .extractor.steam import SteamIE
48 from .extractor.ted import TEDIE
49 from .extractor.ustream import UstreamIE
50 from .extractor.vimeo import VimeoIE
51 from .extractor.worldstarhiphop import WorldStarHipHopIE
52 from .extractor.xnxx import XNXXIE
53 from .extractor.xvideos import XVideosIE
54 from .extractor.yahoo import YahooIE, YahooSearchIE
55 from .extractor.youku import YoukuIE
56 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
57 from .extractor.zdf import ZDFIE
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Metadata is read from the JSON argument of the page's
    ``new Video(...)`` call; one format entry is built for every link
    found inside the page's ``downloadList`` element.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print the extension and format id of every entry in `formats`."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals `req_format`, or None."""
        for entry in formats:
            if entry['format'] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Extract the info dicts for the video at `url`.

        Returns a list of info dicts selected according to the downloader's
        'format' parameter ('best'/unset -> first link, 'worst' -> last link,
        '-1'/'all' -> every link, anything else -> exact format id match).
        Returns None after printing the format table when the downloader's
        'listformats' parameter is set.

        Raises ExtractorError when the URL does not match, the embedded JSON
        is invalid or incomplete, no download link is found, or the requested
        format is not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The page is requested with the age verification cookie already set
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # json.loads signals malformed input with ValueError (or a subclass)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # IndexError covers an empty 'thumbnails' list
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The format id is "<size>-<bitrate>", taken from the first two
            # underscore-separated parts of the fourth path component
            format_id = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_id,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
182
183
184
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com.

    The video id and title are taken from the URL itself; the FLV stream
    URL and the upload date are searched for in the watch page.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id, video_title = url_match.group('videoid', 'title')

        page = self._download_webpage(url, video_id)

        # The stream URL found in the page is percent-decoded before use
        stream_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is looked up with fatal=False and only normalised
        # when a non-empty value came back
        raw_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        upload_date = unified_strdate(raw_date) if raw_date else raw_date

        return [{
            'id': video_id,
            'url': stream_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
219
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com.

    The watch page supplies the title and a link to an embed page; the
    media URL is read from that embed page.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL or when
        no embed page link is present in the watch page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        watch_page = self._download_webpage(url, url_match.group('videoid'))

        raw_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', watch_page, u'title')
        video_title = raw_title.strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        # From here on the numeric id of the embed page is the video id
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)
        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
260
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    The mix description is parsed from the ``PAGE.mix`` assignment in the
    mix page; the tracks are then requested one at a time from the
    ``sets/<session>/play`` and ``sets/<session>/next`` endpoints.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict per track of the mix at `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)
        data = json.loads(self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information',
            flags=re.DOTALL))

        # A random number serves as the play session identifier
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        tracks = []
        fetch_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        for position in itertools.count(1):
            response = json.loads(self._download_webpage(
                fetch_url, playlist_id,
                note=u'Downloading song information %s/%s' % (position, track_count),
                errnote=u'Failed to download song information'))
            current_set = response[u'set']
            track = current_set['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track of the mix itself
            if current_set['at_last_track']:
                break
            # The following track is requested by naming the one just received
            fetch_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
301
class KeekIE(InfoExtractor):
    """Information extractor for keek.com.

    Video and thumbnail URLs are built directly from the video id on
    cdn.keek.com; title and uploader are searched for in the video page.
    """
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        # The uploader is looked up with fatal=False
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
329
330
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is the last path element of the URL; every other field
    comes from the XML document returned by the site's
    ``getvideometadataxml.php`` endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when the metadata XML has no ``url_flv`` or no
        ``title`` element. ``format_id``, ``description`` and
        ``imagePreview`` are optional: the format falls back to the file
        extension, the other two to None.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # mandatory values
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional values
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # No explicit format id: use the file extension instead
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        image_preview_el = metadata.find('imagePreview')
        thumbnail = image_preview_el.text if image_preview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
384
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    The title is searched for in the video page; file name and duration
    are read from the last child of the per-video XML document on
    video2.spiegel.de.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml',
            video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child element of the document root is used
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            # Everything after the last dot of the file name
            'ext': filename.rsplit('.', 1)[-1],
            'title': title,
            'duration': duration,
        }]
416
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com.

    The video id comes from the ``i`` query parameter of the view URL; the
    media URL, title, description and uploader are searched for in the
    view page.
    """

    # The scheme is optional; both http and https are accepted
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The site name prefix is removed from the og:title value
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are looked up with fatal=False
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
453
454
455
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts.

    The post page is always requested through its ``/post/<id>/`` URL; the
    video file URL and extension are matched in the ``\\x22``-escaped
    markup embedded in that page.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when no video file is found in the post page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        video_match = re.search(
            r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id),
            page)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if thumbnail:
            # Backslashes are removed from the matched value
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_match.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': video_match.group('ext'),
        }]
489
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks.

    The track page links to a free download page; that page carries the
    track description, from which a ``statdownload`` request is built
    whose response contains the final file URL.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when the track page has no free download page.
        """
        url_title = re.match(self._VALID_URL, url).group('title')
        track_page = self._download_webpage(url, url_title)

        # We get the link to the free download page
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', track_page)
        if free_page_match is None:
            raise ExtractorError(u'No free songs found')
        download_link = free_page_match.group(1)

        track_id = re.search(
            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            track_page, re.MULTILINE|re.DOTALL).group('id')

        download_page = self._download_webpage(
            download_link, track_id, 'Downloading free downloads page')

        # The track dictionary is embedded in some javascript code
        items_json = re.search(
            r'items: (.*?),$', download_page, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # Using this url directly says the link has expired
        expired_url = track[u'downloads'][u'mp3-320'][u'url']
        parts = re.match(
            r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
            expired_url)

        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        stat_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            parts.group('server'), parts.group('fsig'), track_id, parts.group('ts'))
        stat_page = self._download_webpage(
            stat_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', stat_page).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
535
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com.

    The media URL is taken from the page's mp4 ``<source>`` element and
    the title from its ``videoTitle`` heading.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
563         
class InaIE(InfoExtractor):
    """Information extractor for ina.fr videos.

    Both the media URL and the title are read from the notice's MRSS
    document on player.ina.fr rather than from the video page.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        # The title sits inside a CDATA section
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
590
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com videos.

    The page is always requested through its canonical
    ``www.howcast.com/videos/<id>`` URL; the media URL and the metadata
    are searched for in that page.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        # The meta values may be enclosed in double or single quotes
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        # Description and thumbnail are looked up with fatal=False
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
624
class VineIE(InfoExtractor):
    """Information extractor for vine.co videos.

    The page is always requested through ``https://vine.co/v/<id>``; the
    stream URL, title and thumbnail come from its meta tags, the uploader
    from the ``user`` block.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # Thumbnail and uploader are looked up with fatal=False
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
658
class FlickrIE(InfoExtractor):
    """Information extractor for Flickr videos.

    Resolving the stream takes three requests: the photo page (for the
    photo secret), ``video_mtl_xml.gne`` (for the node id) and
    ``video_playlist.gne`` (for the stream application and path).
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when the playlist XML has no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # First request: obtain the node id for this photo id and secret
        first_xml = self._download_webpage(
            ''.join((
                'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
                video_id, '&secret=', secret, '&bitrate=700&target=_self')),
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Second request: obtain the playlist describing the stream
        second_xml = self._download_webpage(
            ''.join((
                'https://secure.flickr.com/video_playlist.gne?node_id=',
                node_id, '&tech=flash&mode=playlist&bitrate=700&secret=',
                secret, '&rd=video.yahoo.com&noad=1')),
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The media URL is the application URL followed by the unescaped path
        app, full_path = stream.group(1, 2)
        media_url = app + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        # Description and thumbnail are looked up with fatal=False
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]
707
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos.

    The numeric video id and the metadata are searched for in the video
    page; the media URL is the ``high`` file entry of the per-video XML
    document under ``/cvp/2.0/``.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known the URL title identifies the download
        page = self._download_webpage(url, url_match.group('url_title'))

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # Thumbnail and description are looked up with fatal=False
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
746
class XHamsterIE(InfoExtractor):
    """Information extractor for xhamster.com.

    The page is requested through ``xhamster.com/movies/<id>/.html``; the
    media URL is assembled from the ``srv`` and ``file`` values found in
    that page. No description is extracted.
    """
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError when the ``srv``/``file`` pair is not found.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(
            r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',',
            page)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server, media_file = media.group('server', 'file')
        if server:
            media_url = server + '/key=' + media_file
        else:
            # With an empty server the file value is used on its own,
            # percent-decoded
            media_url = compat_urllib_parse.unquote(media_file)
        extension = media_url.split('.')[-1]

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            page, u'title')

        # The upload date is the YYYYMMDD form of the timestamp in the
        # hint attribute; a missing timestamp only produces a warning
        date_match = re.search(
            r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'',
            page)
        if date_match:
            upload_date = ''.join(date_match.group(
                'upload_date_Y', 'upload_date_m', 'upload_date_d'))
        else:
            upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        # The uploader lookup is given the default u'anonymous'
        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            page, u'uploader id', default=u'anonymous')

        thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
        }]
798
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the audio URL and metadata for a single hypem track page.

        Returns a one-element list holding a dict with the keys 'id', 'url',
        'ext', 'title' and 'artist'.

        Raises ExtractorError if the URL does not match _VALID_URL, if either
        JSON document cannot be decoded, or if an expected field is missing
        from the decoded data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The track page is requested with 'ax' and a current timestamp 'ts'
        # appended as query parameters.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is replayed on the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # An empty track list or a missing field is reported as an extraction
        # error instead of leaking a raw IndexError/KeyError/TypeError.
        try:
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine track data is missing expected fields.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            final_url = song_data[u"url"]
        except (KeyError, TypeError):
            raise ExtractorError(u'Hypemachine metadata is missing the media URL.')

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
848
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail for a single Vbox7 video.

        Returns a one-element list holding a dict with the keys 'id', 'url',
        'ext', 'title' and 'thumbnail'.

        Raises ExtractorError if the URL does not match _VALID_URL or if the
        info response is not made of exactly two 'name=value' fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only assigns window.location in a script; follow that
        # redirect by hand, relative to the URL that was actually fetched.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is used as the title.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response is read as two '&'-separated 'name=value' fields:
        # media URL first, thumbnail URL second.
        pairs = info_response.split('&')
        if len(pairs) != 2 or not all('=' in pair for pair in pairs):
            raise ExtractorError(u'Unable to extract the media url')
        # Split on the first '=' only, so values that contain '=' themselves
        # (e.g. a URL with a query string) are kept whole.
        final_url, thumbnail_url = [pair.split('=', 1)[1] for pair in pairs]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
884
885
def gen_extractors():
    """Build one fresh instance of each supported extractor, in priority order.

    Order is significant: the first extractor that matches a URL is the one
    that handles it, so GenericIE stays last.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
955
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    'IE' is appended to ie_name and the result is fetched from this module's
    global namespace; a name that is not defined there raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]