[ustream] move into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
45 from .extractor.stanfordoc import StanfordOpenClassroomIE
46 from .extractor.steam import SteamIE
47 from .extractor.ted import TEDIE
48 from .extractor.ustream import UstreamIE
49 from .extractor.vimeo import VimeoIE
50 from .extractor.worldstarhiphop import WorldStarHipHopIE
51 from .extractor.xnxx import XNXXIE
52 from .extractor.xvideos import XVideosIE
53 from .extractor.yahoo import YahooIE, YahooSearchIE
54 from .extractor.youku import YoukuIE
55 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
56 from .extractor.zdf import ZDFIE
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class RBMARadioIE(InfoExtractor):
    """Information extractor for show pages on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the show at url.

        The metadata is read from the ``gon.show`` JSON object embedded in
        the show page. Raises ExtractorError if that object cannot be parsed
        as JSON.
        """
        show_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, show_id)

        raw_show = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)
        try:
            show = json.loads(raw_show)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # The stream URL is the advertised one plus a fixed 'cbr' parameter.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_path = compat_urllib_parse_urlparse(stream_url).path

        result = {
            'id': show_id,
            'url': stream_url,
            # The extension is whatever follows the last dot of the URL path.
            'ext': stream_path.rpartition('.')[2],
            'title': show['title'],
        }
        # Everything below is optional metadata; missing keys yield None.
        result['description'] = show.get('teaser_text')
        result['location'] = show.get('country_of_origin')
        host = show.get('host', {})
        result['uploader'] = host.get('name')
        result['uploader_id'] = host.get('slug')
        result['thumbnail'] = show.get('image', {}).get('large_url_2x')
        result['duration'] = show.get('duration')
        return [result]
108
109
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print the extension and format name of every entry in formats."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for fmt in formats:
            if fmt['format'] == req_format:
                return fmt
        return None

    def _real_extract(self, url):
        """Extract the download formats offered for the video at url.

        Returns a list of info dicts chosen according to the downloader's
        'format' parameter (first entry for None/'best', last for 'worst',
        everything for '-1'/'all', otherwise the entry with that format
        name). When the 'listformats' parameter is set, the formats are
        printed and None is returned.

        Raises ExtractorError for an invalid URL, unparsable or incomplete
        page metadata, a page without download links, or an unavailable
        requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The page is requested with the age verification cookie already set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # str() is required: an exception cannot be concatenated to a str.
            raise ExtractorError('Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The format name is "<size>-<bitrate>", taken from the first two
            # underscore-separated parts of the fifth path component.
            format_name = '-'.join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
214
215
216
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Id and title are taken from the URL itself; the media URL and the
        optional upload date are scraped from the page. Raises
        ExtractorError when url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id, title = url_match.group('videoid', 'title')

        page = self._download_webpage(url, video_id)

        # The media URL is embedded URL-quoted in the player configuration.
        media_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is optional; normalise it only when it was found.
        added_on = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        if added_on:
            added_on = unified_strdate(added_on)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': added_on,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
251
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The title comes from the watch page; the media URL comes from the
        embed page linked from it. Raises ExtractorError when url is invalid
        or no embed page link is present.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        watch_page = self._download_webpage(url, url_match.group('videoid'))
        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', watch_page, u'title').strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        # From here on the numeric id found in the embed URL is the video id.
        embed_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, embed_id)
        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': embed_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
292
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dict per track of the mix at url.

        Tracks are fetched one at a time from the 'sets' endpoint until the
        response flags the last track. Raises ExtractorError when url does
        not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        mix_slug = url_match.group('id')

        page = self._download_webpage(url, mix_slug)
        mix = json.loads(self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL))

        # A random number serves as the play session token in the API URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        position = 0
        while True:
            position += 1
            reply = json.loads(self._download_webpage(
                api_url, mix_slug,
                note=u'Downloading song information %s/%s' % (str(position), total),
                errnote=u'Failed to download song information'))
            track = reply[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if reply['set']['at_last_track']:
                break
            # The following request names the track that was just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
333
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the keek at url.

        The media and thumbnail URLs are built directly from the id; the
        title and the optional uploader are scraped from the page.
        """
        keek_id = re.match(self._VALID_URL, url).group('videoID')

        media_url = u'http://cdn.keek.com/keek/video/%s' % keek_id
        thumb_url = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % keek_id
        page = self._download_webpage(url, keek_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        # The uploader is optional: a missing match is not fatal.
        author = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': keek_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_url,
            'uploader': author,
        }]
361
362
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The video id is the last non-empty path element of url; all other
        fields come from the metadata XML document fetched for that id.
        Raises ExtractorError when the metadata has no 'url_flv' or no
        'title' element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag, default=None):
            """Return the text of child element tag, or default if absent."""
            element = metadata.find(tag)
            return default if element is None else element.text

        # extract values from metadata; download URL and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id, the file extension names the format.
        video_format = optional_text('format_id', extension)
        description = optional_text('description')
        thumbnail = optional_text('imagePreview')

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
416
class SpiegelIE(InfoExtractor):
    """Information extractor for video pages on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The title is scraped from the page; file name and duration are read
        from the per-video XML document.
        """
        clip_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, clip_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        variants_url = u'http://video2.spiegel.de/flash/' + clip_id + u'.xml'
        variants_xml = self._download_webpage(
            variants_url, clip_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child element of the XML root is used.
        chosen = xml.etree.ElementTree.fromstring(variants_xml)[-1]
        media_name = chosen.findall('./filename')[0].text
        length = float(chosen.findall('./duration')[0].text)

        return [{
            'id': clip_id,
            'url': 'http://video2.spiegel.de/flash/' + media_name,
            # The extension is whatever follows the last dot of the file name.
            'ext': media_name.rpartition('.')[2],
            'title': title,
            'duration': length,
        }]
448
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme part used to read 'http?://', which makes the 'p' optional
    # rather than allowing https; 'https?://' matches http and https.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Media URL and title are mandatory; description and uploader are
        optional. Raises ExtractorError when url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title value may carry a site prefix, which is stripped.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
485
486
487
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at url.

        Raises ExtractorError when no video source is found in the post page.
        """
        url_match = re.match(self._VALID_URL, url)
        post_id = url_match.group('id')
        blog_name = url_match.group('blog_name')

        # Whatever form the input URL had, the '/post/' page is downloaded.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog_name, post_id)
        page = self._download_webpage(post_url, post_id)

        source_pattern = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog_name, post_id)
        source_match = re.search(source_pattern, page)
        if source_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        poster = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if poster:
            poster = poster.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': post_id,
            'url': source_match.group('video_url'),
            'title': title,
            'thumbnail': poster,
            'ext': source_match.group('ext'),
        }]
521
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at url.

        Only tracks that offer a free download page are supported, and the
        'mp3-320' download is always the one selected.

        Raises ExtractorError when url is invalid, the track has no free
        download page, or any of the intermediate pages lacks the expected
        data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        try:
            info = json.loads(m_items.group(1))[0]
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final track url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
567
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        clip_id = url_match.group('id')

        page = self._download_webpage(url, clip_id)
        self.report_extraction(clip_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        # The extension is always reported as mp4.
        entry = dict(id=clip_id, url=media_url, ext='mp4', title=title)
        return [entry]
595         
class InaIE(InfoExtractor):
    """Information extractor for ina.fr."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        All data is read from the MRSS notice of the video rather than from
        the page at url itself.
        """
        notice_id = re.match(self._VALID_URL, url).group('id')
        notice_url = 'http://player.ina.fr/notices/%s.mrss' % notice_id
        notice = self._download_webpage(notice_url, notice_id)

        self.report_extraction(notice_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        # The title is wrapped in a CDATA section inside the <title> element.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        entry = dict(id=notice_id, url=media_url, ext='mp4', title=title)
        return [entry]
622
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Media URL and title are mandatory; description and thumbnail are
        optional.
        """
        clip_id = re.match(self._VALID_URL, url).group('id')
        # The canonical page for the id is fetched, not the URL as given.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + clip_id, clip_id)

        self.report_extraction(clip_id)

        entry = {'id': clip_id}
        entry['url'] = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        entry['ext'] = 'mp4'
        entry['title'] = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        entry['description'] = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        entry['thumbnail'] = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)
        return [entry]
656
class VineIE(InfoExtractor):
    """Information extractor for vine.co."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the vine at url.

        Stream URL and title are mandatory; thumbnail and uploader are
        optional.
        """
        vine_id = re.match(self._VALID_URL, url).group('id')
        # The canonical https page for the id is fetched.
        page = self._download_webpage('https://vine.co/v/' + vine_id, vine_id)

        self.report_extraction(vine_id)

        entry = {'id': vine_id}
        entry['url'] = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        entry['ext'] = 'mp4'
        entry['title'] = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        entry['thumbnail'] = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        entry['uploader'] = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)
        return [entry]
690
class FlickrIE(InfoExtractor):
    """Information extractor for videos on flickr.com."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The stream location is resolved in two steps: the photo secret from
        the page gives a node id, and the node id gives the playlist holding
        the stream. Raises ExtractorError when the playlist has no stream
        entry.
        """
        url_match = re.match(self._VALID_URL, url)
        photo_id = url_match.group('id')
        owner_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + owner_id + '/' + photo_id
        page = self._download_webpage(page_url, photo_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step one: exchange photo id and secret for a node id.
        node_query = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            photo_id, '&secret=', secret, '&bitrate=700&target=_self'])
        node_xml = self._download_webpage(
            node_query, photo_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', node_xml, u'node_id')

        # Step two: fetch the playlist that names the stream.
        playlist_query = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=', node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=', secret,
            '&rd=video.yahoo.com&noad=1'])
        playlist_xml = self._download_webpage(
            playlist_query, photo_id, 'Downloading second data webpage')

        self.report_extraction(photo_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped before being appended.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        summary = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': photo_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': summary,
            'thumbnail': poster,
            'uploader_id': owner_id,
        }]
739
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The numeric video id is scraped from the page and then used to fetch
        the XML document that names the media file. Raises ExtractorError
        when url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Until the real id is known, the URL slug labels the download.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        clip_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(clip_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        summary = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % clip_id,
            clip_id, 'Downloading data webpage')
        # Of the files listed, the one typed "high" is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        return [{
            'id': clip_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': summary,
        }]
778
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix matches.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the movie at url.

        Raises ExtractorError when url does not match _VALID_URL or when the
        media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id alone, with the title part of the URL left empty.
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        if not server:
            # With an empty 'srv', 'file' is taken as the (percent-encoded) URL itself.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = server + '/key=' + mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the site UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
830
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the track at url.

        Raises ExtractorError when url does not match _VALID_URL, when either
        JSON payload cannot be parsed, or when the page lists no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The track page is requested with 'ax' and a current-timestamp 'ts' parameter.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is replayed on the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            # Only the first listed track is used.
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            # Report a missing/empty track list as an extraction failure
            # instead of letting a bare KeyError/IndexError escape.
            raise ExtractorError(u'Hypemachine contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty body must be a bytes object: Python 3's urllib rejects
        # str request data, and b"" is the same object type as "" on Python 2.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
880
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError when url does not match _VALID_URL or when the
        info response does not have the expected two-field layout.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only assigns window.location; the assigned value is
        # appended to the URL that was actually fetched to reach the real page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Keep only the part of <title> before the first '/'.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        # Request data must be bytes on Python 3; urlencode output is pure
        # ASCII, so this encode is a no-op on Python 2 byte strings.
        info_request = compat_urllib_request.Request(info_url, data.encode('ascii'))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Expected layout: two '&'-separated name=value fields, media URL first
        # and thumbnail second. Split on the first '=' only, so that values
        # which themselves contain '=' are not truncated.
        fields = [pair.split('=', 1) for pair in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url = fields[0][1]
        thumbnail_url = fields[1][1]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
916
917
def gen_extractors():
    """ Build and return a list with one new instance of every supported extractor.
    Position in the list is significant: the first extractor that matches a URL handles it.
    """
    # Listed in priority order.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
987
def get_info_extractor(ie_name):
    """Return the class stored in this module's globals under ie_name + 'IE'."""
    class_name = ie_name + 'IE'
    return globals()[class_name]