502df6a1fd12d9886bfbff55312b5dc37bfd1480
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.statigram import StatigramIE
38 from .extractor.photobucket import PhotobucketIE
39 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
40 from .extractor.vimeo import VimeoIE
41 from .extractor.xvideos import XVideosIE
42 from .extractor.yahoo import YahooIE, YahooSearchIE
43 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
44 from .extractor.zdf import ZDFIE
45
46
47
48 class MixcloudIE(InfoExtractor):
49     """Information extractor for www.mixcloud.com"""
50
51     _WORKING = False  # Broken for now; the site's new API looks usable, though: http://www.mixcloud.com/developers/documentation/
52     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
53     IE_NAME = u'mixcloud'
54
55     def report_download_json(self, file_id):
56         """Report JSON download."""
57         self.to_screen(u'Downloading json')
58
59     def get_urls(self, jsonData, fmt, bitrate='best'):
60         """Get urls from 'audio_formats' section in json"""
61         file_url = None
62         try:
63             bitrate_list = jsonData[fmt]
64             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
65                 bitrate = max(bitrate_list) # select highest
66
67             url_list = jsonData[fmt][bitrate]
68         except TypeError: # we have no bitrate info.
69             url_list = jsonData[fmt]
70         return url_list
71
72     def check_urls(self, url_list):
73         """Returns 1st active url from list"""
74         for url in url_list:
75             try:
76                 compat_urllib_request.urlopen(url)
77                 return url
78             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
79                 url = None
80
81         return None
82
83     def _print_formats(self, formats):
84         print('Available formats:')
85         for fmt in formats.keys():
86             for b in formats[fmt]:
87                 try:
88                     ext = formats[fmt][b][0]
89                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
90                 except TypeError: # we have no bitrate info
91                     ext = formats[fmt][0]
92                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
93                     break
94
95     def _real_extract(self, url):
96         mobj = re.match(self._VALID_URL, url)
97         if mobj is None:
98             raise ExtractorError(u'Invalid URL: %s' % url)
99         # extract uploader & filename from url
100         uploader = mobj.group(1)
101         file_id = uploader + "-" + mobj.group(2)
102
103         # construct API request
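        #   the cloudcast endpoint takes the same /<uploader>/<track>/ path
        #   segments that appear in the page URL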
104         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
105         # retrieve .json file with links to files
106         request = compat_urllib_request.Request(file_url)
107         try:
108             self.report_download_json(file_url)
109             jsonData = compat_urllib_request.urlopen(request).read()
110         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
111             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
112
113         # parse JSON
114         json_data = json.loads(jsonData)
115         player_url = json_data['player_swf_url']
116         formats = dict(json_data['audio_formats'])
117
118         req_format = self._downloader.params.get('format', None)
119         bitrate = None
120
121         if self._downloader.params.get('listformats', None):
122             self._print_formats(formats)
123             return
124
125         if req_format is None or req_format == 'best':
126             for format_param in formats.keys():
127                 url_list = self.get_urls(formats, format_param)
128                 # check urls
129                 file_url = self.check_urls(url_list)
130                 if file_url is not None:
131                     break # got it!
132         else:
133             if req_format not in formats:
134                 raise ExtractorError(u'Format is not available')
135
136             url_list = self.get_urls(formats, req_format)
137             file_url = self.check_urls(url_list)
138             format_param = req_format
139
140         return [{
141             'id': file_id,
142             'url': file_url,
143             'uploader': uploader,
144             'upload_date': None,
145             'title': json_data['name'],
146             'ext': file_url.split('.')[-1],
147             'format': format_param if format_param is not None else u'NA',
148             'thumbnail': json_data['thumbnail_url'],
149             'description': json_data['description'],
150             'player_url': player_url,
151         }]
152
153 class StanfordOpenClassroomIE(InfoExtractor):
154     """Information extractor for Stanford's Open ClassRoom"""
155
156     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
157     IE_NAME = u'stanfordoc'
158
159     def _real_extract(self, url):
160         mobj = re.match(self._VALID_URL, url)
161         if mobj is None:
162             raise ExtractorError(u'Invalid URL: %s' % url)
163
164         if mobj.group('course') and mobj.group('video'): # A specific video
165             course = mobj.group('course')
166             video = mobj.group('video')
167             info = {
168                 'id': course + '_' + video,
169                 'uploader': None,
170                 'upload_date': None,
171             }
172
173             self.report_extraction(info['id'])
174             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
175             xmlUrl = baseUrl + video + '.xml'
176             try:
177                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
178             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
179                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
180             mdoc = xml.etree.ElementTree.fromstring(metaXml)
181             try:
182                 info['title'] = mdoc.findall('./title')[0].text
183                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
184             except IndexError:
185                 raise ExtractorError(u'Invalid metadata XML file')
186             info['ext'] = info['url'].rpartition('.')[2]
187             return [info]
188         elif mobj.group('course'): # A course page
189             course = mobj.group('course')
190             info = {
191                 'id': course,
192                 'type': 'playlist',
193                 'uploader': None,
194                 'upload_date': None,
195             }
196
197             coursepage = self._download_webpage(url, info['id'],
198                                         note='Downloading course info page',
199                                         errnote='Unable to download course info page')
200
201             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
202
203             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
204                 coursepage, u'description', fatal=False)
205
206             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
207             info['list'] = [
208                 {
209                     'type': 'reference',
210                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
211                 }
212                     for vpage in links]
213             results = []
214             for entry in info['list']:
215                 assert entry['type'] == 'reference'
216                 results += self.extract(entry['url'])
217             return results
218         else: # Root page
219             info = {
220                 'id': 'Stanford OpenClassroom',
221                 'type': 'playlist',
222                 'uploader': None,
223                 'upload_date': None,
224             }
225
226             self.report_download_webpage(info['id'])
227             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
228             try:
229                 rootpage = compat_urllib_request.urlopen(rootURL).read()
230             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
231                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
232
233             info['title'] = info['id']
234
235             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
236             info['list'] = [
237                 {
238                     'type': 'reference',
239                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
240                 }
241                     for cpage in links]
242
243             results = []
244             for entry in info['list']:
245                 assert entry['type'] == 'reference'
246                 results += self.extract(entry['url'])
247             return results
248
249 class MTVIE(InfoExtractor):
250     """Information extractor for MTV.com"""
251
252     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
253     IE_NAME = u'mtv'
254
255     def _real_extract(self, url):
256         mobj = re.match(self._VALID_URL, url)
257         if mobj is None:
258             raise ExtractorError(u'Invalid URL: %s' % url)
259         if not mobj.group('proto'):
260             url = 'http://' + url
261         video_id = mobj.group('videoid')
262
263         webpage = self._download_webpage(url, video_id)
264
265         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
266             webpage, u'song name', fatal=False)
267
268         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
269             webpage, u'title')
270
271         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
272             webpage, u'mtvn_uri', fatal=False)
273
274         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
275             webpage, u'content id', fatal=False)
276
277         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
278         self.report_extraction(video_id)
279         request = compat_urllib_request.Request(videogen_url)
280         try:
281             metadataXml = compat_urllib_request.urlopen(request).read()
282         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
283             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
284
285         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
286         renditions = mdoc.findall('.//rendition')
287
288         # For now, always pick the highest quality.
289         rendition = renditions[-1]
290
291         try:
292             _,_,ext = rendition.attrib['type'].partition('/')
293             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
294             video_url = rendition.find('./src').text
295         except KeyError:
296             raise ExtractorError('Invalid rendition field.')
297
298         info = {
299             'id': video_id,
300             'url': video_url,
301             'uploader': performer,
302             'upload_date': None,
303             'title': video_title,
304             'ext': ext,
305             'format': format,
306         }
307
308         return [info]
309
310
311 class YoukuIE(InfoExtractor):
312     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
313
314     def _gen_sid(self):
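        # Session id: current time in milliseconds followed by two random
        # numbers, apparently mimicking the id format used by Youku's player.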
315         nowTime = int(time.time() * 1000)
316         random1 = random.randint(1000,1998)
317         random2 = random.randint(1000,9999)
318
319         return "%d%d%d" %(nowTime,random1,random2)
320
321     def _get_file_ID_mix_string(self, seed):
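        # Build a shuffled copy of the character table below, driven by a small
        # linear-congruential generator keyed on 'seed'; this appears to mirror
        # the scrambling done by Youku's own player.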
322         mixed = []
323         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
324         seed = float(seed)
325         for i in range(len(source)):
326             seed  =  (seed * 211 + 30031 ) % 65536
327             index  =  math.floor(seed / 65536 * len(source) )
328             mixed.append(source[int(index)])
329             source.remove(source[int(index)])
330         #return ''.join(mixed)
331         return mixed
332
333     def _get_file_id(self, fileId, seed):
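        # The fileId returned by the API is a '*'-separated list of decimal
        # indices; map each index through the shuffled table to recover the
        # real file id used in the download URLs.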
334         mixed = self._get_file_ID_mix_string(seed)
335         ids = fileId.split('*')
336         realId = []
337         for ch in ids:
338             if ch:
339                 realId.append(mixed[int(ch)])
340         return ''.join(realId)
341
342     def _real_extract(self, url):
343         mobj = re.match(self._VALID_URL, url)
344         if mobj is None:
345             raise ExtractorError(u'Invalid URL: %s' % url)
346         video_id = mobj.group('ID')
347
348         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
349
350         jsondata = self._download_webpage(info_url, video_id)
351
352         self.report_extraction(video_id)
353         try:
354             config = json.loads(jsondata)
355
356             video_title =  config['data'][0]['title']
357             seed = config['data'][0]['seed']
358
359             format = self._downloader.params.get('format', None)
360             supported_format = list(config['data'][0]['streamfileids'].keys())
361
362             if format is None or format == 'best':
363                 if 'hd2' in supported_format:
364                     format = 'hd2'
365                 else:
366                     format = 'flv'
367                 ext = u'flv'
368             elif format == 'worst':
369                 format = 'mp4'
370                 ext = u'mp4'
371             else:
372                 format = 'flv'
373                 ext = u'flv'
374
375
376             fileid = config['data'][0]['streamfileids'][format]
377             keys = [s['k'] for s in config['data'][0]['segs'][format]]
378         except (UnicodeDecodeError, ValueError, KeyError):
379             raise ExtractorError(u'Unable to extract info section')
380
381         files_info=[]
382         sid = self._gen_sid()
383         fileid = self._get_file_id(fileid, seed)
384
385         # Characters 9 and 10 of the fileid (fileid[8:10]) encode the segment
386         # number; replace them with the zero-padded hex index of each segment.
387         for index, key in enumerate(keys):
388
389             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
390             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
391
392             info = {
393                 'id': '%s_part%02d' % (video_id, index),
394                 'url': download_url,
395                 'uploader': None,
396                 'upload_date': None,
397                 'title': video_title,
398                 'ext': ext,
399             }
400             files_info.append(info)
401
402         return files_info
403
404
405 class XNXXIE(InfoExtractor):
406     """Information extractor for xnxx.com"""
407
408     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
409     IE_NAME = u'xnxx'
410     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
411     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
412     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
413
414     def _real_extract(self, url):
415         mobj = re.match(self._VALID_URL, url)
416         if mobj is None:
417             raise ExtractorError(u'Invalid URL: %s' % url)
418         video_id = mobj.group(1)
419
420         # Get webpage content
421         webpage = self._download_webpage(url, video_id)
422
423         video_url = self._search_regex(self.VIDEO_URL_RE,
424             webpage, u'video URL')
425         video_url = compat_urllib_parse.unquote(video_url)
426
427         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
428             webpage, u'title')
429
430         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
431             webpage, u'thumbnail', fatal=False)
432
433         return [{
434             'id': video_id,
435             'url': video_url,
436             'uploader': None,
437             'upload_date': None,
438             'title': video_title,
439             'ext': 'flv',
440             'thumbnail': video_thumbnail,
441             'description': None,
442         }]
443
444
445
446 class NBAIE(InfoExtractor):
447     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
448     IE_NAME = u'nba'
449
450     def _real_extract(self, url):
451         mobj = re.match(self._VALID_URL, url)
452         if mobj is None:
453             raise ExtractorError(u'Invalid URL: %s' % url)
454
455         video_id = mobj.group(1)
456
457         webpage = self._download_webpage(url, video_id)
458
459         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
460
461         shortened_video_id = video_id.rpartition('/')[2]
462         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
463             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
464
465         # The upload date is not present in the HTML the server returns to us
466         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
467
468         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
469
470         info = {
471             'id': shortened_video_id,
472             'url': video_url,
473             'ext': 'mp4',
474             'title': title,
475             # 'uploader_date': uploader_date,
476             'description': description,
477         }
478         return [info]
479
480 class JustinTVIE(InfoExtractor):
481     """Information extractor for justin.tv and twitch.tv"""
482     # TODO: One broadcast may be split into multiple videos. The key
483     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
484     # starts at 1 and increases. Can we treat all parts as one video?
485
486     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
487         (?:
488             (?P<channelid>[^/]+)|
489             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
490             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
491         )
492         /?(?:\#.*)?$
493         """
494     _JUSTIN_PAGE_LIMIT = 100
495     IE_NAME = u'justin.tv'
496
497     def report_download_page(self, channel, offset):
498         """Report attempt to download a single page of videos."""
499         self.to_screen(u'%s: Downloading video information from %d to %d' %
500                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
501
502     # Return count of items, list of *valid* items
503     def _parse_page(self, url, video_id):
504         webpage = self._download_webpage(url, video_id,
505                                          u'Downloading video info JSON',
506                                          u'unable to download video info JSON')
507
508         response = json.loads(webpage)
509         if not isinstance(response, list):
510             error_text = response.get('error', 'unknown error')
511             raise ExtractorError(u'Justin.tv API: %s' % error_text)
512         info = []
513         for clip in response:
514             video_url = clip['video_file_url']
515             if video_url:
516                 video_extension = os.path.splitext(video_url)[1][1:]
517                 video_date = re.sub('-', '', clip['start_time'][:10])
518                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
519                 video_id = clip['id']
520                 video_title = clip.get('title', video_id)
521                 info.append({
522                     'id': video_id,
523                     'url': video_url,
524                     'title': video_title,
525                     'uploader': clip.get('channel_name', video_uploader_id),
526                     'uploader_id': video_uploader_id,
527                     'upload_date': video_date,
528                     'ext': video_extension,
529                 })
530         return (len(response), info)
531
532     def _real_extract(self, url):
533         mobj = re.match(self._VALID_URL, url)
534         if mobj is None:
535             raise ExtractorError(u'invalid URL: %s' % url)
536
537         api_base = 'http://api.justin.tv'
538         paged = False
539         if mobj.group('channelid'):
540             paged = True
541             video_id = mobj.group('channelid')
542             api = api_base + '/channel/archives/%s.json' % video_id
543         elif mobj.group('chapterid'):
544             chapter_id = mobj.group('chapterid')
545
546             webpage = self._download_webpage(url, chapter_id)
547             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
548             if not m:
549                 raise ExtractorError(u'Cannot find archive of a chapter')
550             archive_id = m.group(1)
551
552             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
553             chapter_info_xml = self._download_webpage(api, chapter_id,
554                                              note=u'Downloading chapter information',
555                                              errnote=u'Chapter information download failed')
556             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
557             for a in doc.findall('.//archive'):
558                 if archive_id == a.find('./id').text:
559                     break
560             else:
561                 raise ExtractorError(u'Could not find chapter in chapter information')
562
563             video_url = a.find('./video_file_url').text
564             video_ext = video_url.rpartition('.')[2] or u'flv'
565
566             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
567             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
568                                    note='Downloading chapter metadata',
569                                    errnote='Download of chapter metadata failed')
570             chapter_info = json.loads(chapter_info_json)
571
572             bracket_start = int(doc.find('.//bracket_start').text)
573             bracket_end = int(doc.find('.//bracket_end').text)
574
575             # TODO determine start (and probably fix up file)
576             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
577             #video_url += u'?start=' + TODO:start_timestamp
578             # bracket_start is 13290, but we want 51670615
579             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
580                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
581
582             info = {
583                 'id': u'c' + chapter_id,
584                 'url': video_url,
585                 'ext': video_ext,
586                 'title': chapter_info['title'],
587                 'thumbnail': chapter_info['preview'],
588                 'description': chapter_info['description'],
589                 'uploader': chapter_info['channel']['display_name'],
590                 'uploader_id': chapter_info['channel']['name'],
591             }
592             return [info]
593         else:
594             video_id = mobj.group('videoid')
595             api = api_base + '/broadcast/by_archive/%s.json' % video_id
596
597         self.report_extraction(video_id)
598
599         info = []
600         offset = 0
601         limit = self._JUSTIN_PAGE_LIMIT
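        # Channel archives are paged; keep fetching pages until one comes back
        # with fewer than 'limit' entries (single videos need only one request).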
602         while True:
603             if paged:
604                 self.report_download_page(video_id, offset)
605             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
606             page_count, page_info = self._parse_page(page_url, video_id)
607             info.extend(page_info)
608             if not paged or page_count != limit:
609                 break
610             offset += limit
611         return info
612
613 class FunnyOrDieIE(InfoExtractor):
614     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
615
616     def _real_extract(self, url):
617         mobj = re.match(self._VALID_URL, url)
618         if mobj is None:
619             raise ExtractorError(u'invalid URL: %s' % url)
620
621         video_id = mobj.group('id')
622         webpage = self._download_webpage(url, video_id)
623
624         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
625             webpage, u'video URL', flags=re.DOTALL)
626
627         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
628             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
629
630         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
631             webpage, u'description', fatal=False, flags=re.DOTALL)
632
633         info = {
634             'id': video_id,
635             'url': video_url,
636             'ext': 'mp4',
637             'title': title,
638             'description': video_description,
639         }
640         return [info]
641
642 class SteamIE(InfoExtractor):
643     _VALID_URL = r"""http://store\.steampowered\.com/
644                 (agecheck/)?
645                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
646                 (?P<gameID>\d+)/?
647                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
648                 """
649     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
650     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
651
652     @classmethod
653     def suitable(cls, url):
654         """Receives a URL and returns True if suitable for this IE."""
655         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
656
657     def _real_extract(self, url):
658         m = re.match(self._VALID_URL, url, re.VERBOSE)
659         gameID = m.group('gameID')
660
661         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
662         webpage = self._download_webpage(videourl, gameID)
663
664         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
665             videourl = self._AGECHECK_TEMPLATE % gameID
666             self.report_age_confirmation()
667             webpage = self._download_webpage(videourl, gameID)
668
669         self.report_extraction(gameID)
670         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
671                                              webpage, 'game title')
672
673         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
674         mweb = re.finditer(urlRE, webpage)
675         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
676         titles = re.finditer(namesRE, webpage)
677         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
678         thumbs = re.finditer(thumbsRE, webpage)
679         videos = []
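        # The movie URLs, titles and thumbnails are assumed to appear in the
        # same order on the page, so the three iterators can simply be zipped.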
680         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
681             video_id = vid.group('videoID')
682             title = vtitle.group('videoName')
683             video_url = vid.group('videoURL')
684             video_thumb = thumb.group('thumbnail')
685             if not video_url:
686                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
687             info = {
688                 'id': video_id,
689                 'url': video_url,
690                 'ext': 'flv',
691                 'title': unescapeHTML(title),
692                 'thumbnail': video_thumb,
693             }
694             videos.append(info)
695         return [self.playlist_result(videos, gameID, game_title)]
696
697 class UstreamIE(InfoExtractor):
698     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
699     IE_NAME = u'ustream'
700
701     def _real_extract(self, url):
702         m = re.match(self._VALID_URL, url)
703         video_id = m.group('videoID')
704
705         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
706         webpage = self._download_webpage(url, video_id)
707
708         self.report_extraction(video_id)
709
710         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
711             webpage, u'title')
712
713         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
714             webpage, u'uploader', fatal=False, flags=re.DOTALL)
715
716         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
717             webpage, u'thumbnail', fatal=False)
718
719         info = {
720                 'id': video_id,
721                 'url': video_url,
722                 'ext': 'flv',
723                 'title': video_title,
724                 'uploader': uploader,
725                 'thumbnail': thumbnail,
726                }
727         return info
728
729 class WorldStarHipHopIE(InfoExtractor):
730     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
731     IE_NAME = u'WorldStarHipHop'
732
733     def _real_extract(self, url):
734         m = re.match(self._VALID_URL, url)
735         video_id = m.group('id')
736
737         webpage_src = self._download_webpage(url, video_id)
738
739         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
740             webpage_src, u'video URL')
741
742         if 'mp4' in video_url:
743             ext = 'mp4'
744         else:
745             ext = 'flv'
746
747         video_title = self._html_search_regex(r"<title>(.*)</title>",
748             webpage_src, u'title')
749
750         # Get the thumbnail; if there is none, this is a WSHH candy video and the title has to be read from a different element.
751         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
752             webpage_src, u'thumbnail', fatal=False)
753
754         if not thumbnail:
755             _title = r"""candytitles.*>(.*)</span>"""
756             mobj = re.search(_title, webpage_src)
757             if mobj is not None:
758                 video_title = mobj.group(1)
759
760         results = [{
761                     'id': video_id,
762                     'url' : video_url,
763                     'title' : video_title,
764                     'thumbnail' : thumbnail,
765                     'ext' : ext,
766                     }]
767         return results
768
769 class RBMARadioIE(InfoExtractor):
770     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
771
772     def _real_extract(self, url):
773         m = re.match(self._VALID_URL, url)
774         video_id = m.group('videoID')
775
776         webpage = self._download_webpage(url, video_id)
777
778         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
779             webpage, u'json data', flags=re.MULTILINE)
780
781         try:
782             data = json.loads(json_data)
783         except ValueError as e:
784             raise ExtractorError(u'Invalid JSON: ' + str(e))
785
786         video_url = data['akamai_url'] + '&cbr=256'
787         url_parts = compat_urllib_parse_urlparse(video_url)
788         video_ext = url_parts.path.rpartition('.')[2]
789         info = {
790                 'id': video_id,
791                 'url': video_url,
792                 'ext': video_ext,
793                 'title': data['title'],
794                 'description': data.get('teaser_text'),
795                 'location': data.get('country_of_origin'),
796                 'uploader': data.get('host', {}).get('name'),
797                 'uploader_id': data.get('host', {}).get('slug'),
798                 'thumbnail': data.get('image', {}).get('large_url_2x'),
799                 'duration': data.get('duration'),
800         }
801         return [info]
802
803
804 class YouPornIE(InfoExtractor):
805     """Information extractor for youporn.com."""
806     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
807
808     def _print_formats(self, formats):
809         """Print all available formats"""
810         print(u'Available formats:')
811         print(u'ext\t\tformat')
812         print(u'---------------------------------')
813         for format in formats:
814             print(u'%s\t\t%s'  % (format['ext'], format['format']))
815
816     def _specific(self, req_format, formats):
817         for x in formats:
818             if(x["format"]==req_format):
819                 return x
820         return None
821
822     def _real_extract(self, url):
823         mobj = re.match(self._VALID_URL, url)
824         if mobj is None:
825             raise ExtractorError(u'Invalid URL: %s' % url)
826         video_id = mobj.group('videoid')
827
828         req = compat_urllib_request.Request(url)
829         req.add_header('Cookie', 'age_verified=1')
830         webpage = self._download_webpage(req, video_id)
831
832         # Get JSON parameters
833         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
834         try:
835             params = json.loads(json_params)
836         except ValueError:
837             raise ExtractorError(u'Invalid JSON')
838
839         self.report_extraction(video_id)
840         try:
841             video_title = params['title']
842             upload_date = unified_strdate(params['release_date_f'])
843             video_description = params['description']
844             video_uploader = params['submitted_by']
845             thumbnail = params['thumbnails'][0]['image']
846         except KeyError as err:
847             raise ExtractorError(u'Missing JSON parameter: ' + compat_str(err))
848
849         # Get all of the formats available
850         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
851         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
852             webpage, u'download list').strip()
853
854         # Get all of the links from the page
855         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
856         links = re.findall(LINK_RE, download_list_html)
857         if not links:
858             raise ExtractorError(u'ERROR: no known formats available for video')
859
860         self.to_screen(u'Links found: %d' % len(links))
861
862         formats = []
863         for link in links:
864
865             # A link looks like this:
866             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
867             # A path looks like this:
868             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
869             video_url = unescapeHTML( link )
870             path = compat_urllib_parse_urlparse( video_url ).path
871             extension = os.path.splitext( path )[1][1:]
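            # The fifth path component looks like '480p_370k_8004515'; its first
            # two '_'-separated fields give the resolution and bitrate.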
872             format = path.split('/')[4].split('_')[:2]
873             size = format[0]
874             bitrate = format[1]
875             format = "-".join( format )
876             # title = u'%s-%s-%s' % (video_title, size, bitrate)
877
878             formats.append({
879                 'id': video_id,
880                 'url': video_url,
881                 'uploader': video_uploader,
882                 'upload_date': upload_date,
883                 'title': video_title,
884                 'ext': extension,
885                 'format': format,
886                 'thumbnail': thumbnail,
887                 'description': video_description
888             })
889
890         if self._downloader.params.get('listformats', None):
891             self._print_formats(formats)
892             return
893
894         req_format = self._downloader.params.get('format', None)
895         self.to_screen(u'Format: %s' % req_format)
896
897         if req_format is None or req_format == 'best':
898             return [formats[0]]
899         elif req_format == 'worst':
900             return [formats[-1]]
901         elif req_format in ('-1', 'all'):
902             return formats
903         else:
904             format = self._specific( req_format, formats )
905             if format is None:
906                 raise ExtractorError(u'Requested format not available')
907             return [format]
908
909
910
911 class PornotubeIE(InfoExtractor):
912     """Information extractor for pornotube.com."""
913     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
914
915     def _real_extract(self, url):
916         mobj = re.match(self._VALID_URL, url)
917         if mobj is None:
918             raise ExtractorError(u'Invalid URL: %s' % url)
919
920         video_id = mobj.group('videoid')
921         video_title = mobj.group('title')
922
923         # Get webpage content
924         webpage = self._download_webpage(url, video_id)
925
926         # Get the video URL
927         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
928         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
929         video_url = compat_urllib_parse.unquote(video_url)
930
931         # Get the upload date
932         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
933         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
934         if upload_date: upload_date = unified_strdate(upload_date)
935
936         info = {'id': video_id,
937                 'url': video_url,
938                 'uploader': None,
939                 'upload_date': upload_date,
940                 'title': video_title,
941                 'ext': 'flv',
942                 'format': 'flv'}
943
944         return [info]
945
946 class YouJizzIE(InfoExtractor):
947     """Information extractor for youjizz.com."""
948     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
949
950     def _real_extract(self, url):
951         mobj = re.match(self._VALID_URL, url)
952         if mobj is None:
953             raise ExtractorError(u'Invalid URL: %s' % url)
954
955         video_id = mobj.group('videoid')
956
957         # Get webpage content
958         webpage = self._download_webpage(url, video_id)
959
960         # Get the video title
961         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
962             webpage, u'title').strip()
963
964         # Get the embed page
965         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
966         if result is None:
967             raise ExtractorError(u'ERROR: unable to extract embed page')
968
969         embed_page_url = result.group(0).strip()
970         video_id = result.group('videoid')
971
972         webpage = self._download_webpage(embed_page_url, video_id)
973
974         # Get the video URL
975         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
976             webpage, u'video URL')
977
978         info = {'id': video_id,
979                 'url': video_url,
980                 'title': video_title,
981                 'ext': 'flv',
982                 'format': 'flv',
983                 'player_url': embed_page_url}
984
985         return [info]
986
987 class EightTracksIE(InfoExtractor):
988     IE_NAME = '8tracks'
989     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
990
991     def _real_extract(self, url):
992         mobj = re.match(self._VALID_URL, url)
993         if mobj is None:
994             raise ExtractorError(u'Invalid URL: %s' % url)
995         playlist_id = mobj.group('id')
996
997         webpage = self._download_webpage(url, playlist_id)
998
999         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1000         data = json.loads(json_like)
1001
1002         session = str(random.randint(0, 1000000000))
1003         mix_id = data['id']
1004         track_count = data['tracks_count']
1005         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1006         next_url = first_url
1007         res = []
1008         for i in itertools.count():
1009             api_json = self._download_webpage(next_url, playlist_id,
1010                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1011                 errnote=u'Failed to download song information')
1012             api_data = json.loads(api_json)
1013             track_data = api_data[u'set']['track']
1014             info = {
1015                 'id': track_data['id'],
1016                 'url': track_data['track_file_stream_url'],
1017                 'title': track_data['performer'] + u' - ' + track_data['name'],
1018                 'raw_title': track_data['name'],
1019                 'uploader_id': data['user']['login'],
1020                 'ext': 'm4a',
1021             }
1022             res.append(info)
1023             if api_data['set']['at_last_track']:
1024                 break
1025             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1026         return res
1027
1028 class KeekIE(InfoExtractor):
1029     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1030     IE_NAME = u'keek'
1031
1032     def _real_extract(self, url):
1033         m = re.match(self._VALID_URL, url)
1034         video_id = m.group('videoID')
1035
1036         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1037         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1038         webpage = self._download_webpage(url, video_id)
1039
1040         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1041             webpage, u'title')
1042
1043         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1044             webpage, u'uploader', fatal=False)
1045
1046         info = {
1047                 'id': video_id,
1048                 'url': video_url,
1049                 'ext': 'mp4',
1050                 'title': video_title,
1051                 'thumbnail': thumbnail,
1052                 'uploader': uploader
1053         }
1054         return [info]
1055
1056 class TEDIE(InfoExtractor):
1057     _VALID_URL=r'''http://www\.ted\.com/
1058                    (
1059                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1060                         |
1061                         ((?P<type_talk>talks)) # We have a simple talk
1062                    )
1063                    (/lang/(.*?))? # The url may contain the language
1064                    /(?P<name>\w+) # Here goes the name and then ".html"
1065                    '''
1066
1067     @classmethod
1068     def suitable(cls, url):
1069         """Receives a URL and returns True if suitable for this IE."""
1070         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1071
1072     def _real_extract(self, url):
1073         m = re.match(self._VALID_URL, url, re.VERBOSE)
1074         if m.group('type_talk'):
1075             return [self._talk_info(url)]
1076         else:
1077             playlist_id = m.group('playlist_id')
1078             name = m.group('name')
1079             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
1080             return [self._playlist_videos_info(url, name, playlist_id)]
1081
1082     def _playlist_videos_info(self,url,name,playlist_id=0):
1083         '''Returns the videos of the playlist'''
1084         video_RE=r'''
1085                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1086                      ([.\s]*?)data-playlist_item_id="(\d+)"
1087                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1088                      '''
1089         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1090         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1091         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1092         m_names=re.finditer(video_name_RE,webpage)
1093
1094         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1095                                                  webpage, 'playlist title')
1096
1097         playlist_entries = []
1098         for m_video, m_name in zip(m_videos,m_names):
1099             video_id=m_video.group('video_id')
1100             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1101             playlist_entries.append(self.url_result(talk_url, 'TED'))
1102         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1103
1104     def _talk_info(self, url, video_id=0):
1105         """Return the video for the talk in the url"""
1106         m = re.match(self._VALID_URL, url,re.VERBOSE)
1107         video_name = m.group('name')
1108         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1109         self.report_extraction(video_name)
1110         # If the url includes the language we get the title translated
1111         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1112                                         webpage, 'title')
1113         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1114                                     webpage, 'json data')
1115         info = json.loads(json_data)
1116         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1117                                        webpage, 'description', flags = re.DOTALL)
1118         
1119         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1120                                        webpage, 'thumbnail')
1121         info = {
1122                 'id': info['id'],
1123                 'url': info['htmlStreams'][-1]['file'],
1124                 'ext': 'mp4',
1125                 'title': title,
1126                 'thumbnail': thumbnail,
1127                 'description': desc,
1128                 }
1129         return info
1130
1131 class MySpassIE(InfoExtractor):
1132     _VALID_URL = r'http://www.myspass.de/.*'
1133
1134     def _real_extract(self, url):
1135         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1136
1137         # video id is the last path element of the URL
1138         # usually there is a trailing slash, so also try the second but last
1139         url_path = compat_urllib_parse_urlparse(url).path
1140         url_parent_path, video_id = os.path.split(url_path)
1141         if not video_id:
1142             _, video_id = os.path.split(url_parent_path)
1143
1144         # get metadata
1145         metadata_url = META_DATA_URL_TEMPLATE % video_id
1146         metadata_text = self._download_webpage(metadata_url, video_id)
1147         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1148
1149         # extract values from metadata
1150         url_flv_el = metadata.find('url_flv')
1151         if url_flv_el is None:
1152             raise ExtractorError(u'Unable to extract download url')
1153         video_url = url_flv_el.text
1154         extension = os.path.splitext(video_url)[1][1:]
1155         title_el = metadata.find('title')
1156         if title_el is None:
1157             raise ExtractorError(u'Unable to extract title')
1158         title = title_el.text
1159         format_id_el = metadata.find('format_id')
1160         if format_id_el is None:
1161             format = extension
1162         else:
1163             format = format_id_el.text
1164         description_el = metadata.find('description')
1165         if description_el is not None:
1166             description = description_el.text
1167         else:
1168             description = None
1169         imagePreview_el = metadata.find('imagePreview')
1170         if imagePreview_el is not None:
1171             thumbnail = imagePreview_el.text
1172         else:
1173             thumbnail = None
1174         info = {
1175             'id': video_id,
1176             'url': video_url,
1177             'title': title,
1178             'ext': extension,
1179             'format': format,
1180             'thumbnail': thumbnail,
1181             'description': description
1182         }
1183         return [info]
1184
1185 class SpiegelIE(InfoExtractor):
1186     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1187
1188     def _real_extract(self, url):
1189         m = re.match(self._VALID_URL, url)
1190         video_id = m.group('videoID')
1191
1192         webpage = self._download_webpage(url, video_id)
1193
1194         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1195             webpage, u'title')
1196
1197         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1198         xml_code = self._download_webpage(xml_url, video_id,
1199                     note=u'Downloading XML', errnote=u'Failed to download XML')
1200
1201         idoc = xml.etree.ElementTree.fromstring(xml_code)
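        # The last <type> element in the XML appears to describe the highest
        # quality variant available; take its filename and duration.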
1202         last_type = idoc[-1]
1203         filename = last_type.findall('./filename')[0].text
1204         duration = float(last_type.findall('./duration')[0].text)
1205
1206         video_url = 'http://video2.spiegel.de/flash/' + filename
1207         video_ext = filename.rpartition('.')[2]
1208         info = {
1209             'id': video_id,
1210             'url': video_url,
1211             'ext': video_ext,
1212             'title': video_title,
1213             'duration': duration,
1214         }
1215         return [info]
1216
1217 class LiveLeakIE(InfoExtractor):
1218
1219     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1220     IE_NAME = u'liveleak'
1221
1222     def _real_extract(self, url):
1223         mobj = re.match(self._VALID_URL, url)
1224         if mobj is None:
1225             raise ExtractorError(u'Invalid URL: %s' % url)
1226
1227         video_id = mobj.group('video_id')
1228
1229         webpage = self._download_webpage(url, video_id)
1230
1231         video_url = self._search_regex(r'file: "(.*?)",',
1232             webpage, u'video URL')
1233
1234         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1235             webpage, u'title').replace('LiveLeak.com -', '').strip()
1236
1237         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1238             webpage, u'description', fatal=False)
1239
1240         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1241             webpage, u'uploader', fatal=False)
1242
1243         info = {
1244             'id':  video_id,
1245             'url': video_url,
1246             'ext': 'mp4',
1247             'title': video_title,
1248             'description': video_description,
1249             'uploader': video_uploader
1250         }
1251
1252         return [info]
1253
1254
1255
1256 class TumblrIE(InfoExtractor):
1257     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1258
1259     def _real_extract(self, url):
1260         m_url = re.match(self._VALID_URL, url)
1261         video_id = m_url.group('id')
1262         blog = m_url.group('blog_name')
1263
1264         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1265         webpage = self._download_webpage(url, video_id)
1266
1267         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1268         video = re.search(re_video, webpage)
1269         if video is None:
1270             raise ExtractorError(u'Unable to extract video')
1271         video_url = video.group('video_url')
1272         ext = video.group('ext')
1273
1274         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1275             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1276         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1277
1278         # The only place where you can get a title, it's not complete,
1279         # but searching in other places doesn't work for all videos
1280         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1281             webpage, u'title', flags=re.DOTALL)
1282
1283         return [{'id': video_id,
1284                  'url': video_url,
1285                  'title': video_title,
1286                  'thumbnail': video_thumbnail,
1287                  'ext': ext
1288                  }]
1289
1290 class BandcampIE(InfoExtractor):
1291     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1292
1293     def _real_extract(self, url):
1294         mobj = re.match(self._VALID_URL, url)
1295         title = mobj.group('title')
1296         webpage = self._download_webpage(url, title)
1297         # We get the link to the free download page
1298         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1299         if m_download is None:
1300             raise ExtractorError(u'No free songs found')
1301
1302         download_link = m_download.group(1)
1303         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1304                        webpage, re.MULTILINE|re.DOTALL).group('id')
1305
1306         download_webpage = self._download_webpage(download_link, id,
1307                                                   'Downloading free downloads page')
1308         # We get the dictionary of the track from some javascript code
1309         info = re.search(r'items: (.*?),$',
1310                          download_webpage, re.MULTILINE).group(1)
1311         info = json.loads(info)[0]
1312         # We pick mp3-320 for now, until format selection can be easily implemented.
1313         mp3_info = info[u'downloads'][u'mp3-320']
1314         # If we try to use this url it says the link has expired
1315         initial_url = mp3_info[u'url']
1316         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1317         m_url = re.match(re_url, initial_url)
1318         # We build the url we will use to get the final track url
1319         # This url is built by Bandcamp in the script download_bunde_*.js
1320         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1321         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1322         # If we could correctly generate the .rand field the url would be
1323         # in the "download_url" key
1324         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1325
1326         track_info = {'id': id,
1327                       'title': info[u'title'],
1328                       'ext': 'mp3',
1329                       'url': final_url,
1330                       'thumbnail': info[u'thumb_url'],
1331                       'uploader': info[u'artist']
1332                       }
1333
1334         return [track_info]
1335
1336 class RedTubeIE(InfoExtractor):
1337     """Information Extractor for redtube"""
1338     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1339
1340     def _real_extract(self, url):
1341         mobj = re.match(self._VALID_URL, url)
1342         if mobj is None:
1343             raise ExtractorError(u'Invalid URL: %s' % url)
1344
1345         video_id = mobj.group('id')
1346         video_extension = 'mp4'
1347         webpage = self._download_webpage(url, video_id)
1348
1349         self.report_extraction(video_id)
1350
1351         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1352             webpage, u'video URL')
1353
1354         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1355             webpage, u'title')
1356
1357         return [{
1358             'id':       video_id,
1359             'url':      video_url,
1360             'ext':      video_extension,
1361             'title':    video_title,
1362         }]
1363
1364 class InaIE(InfoExtractor):
1365     """Information Extractor for Ina.fr"""
1366     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1367
1368     def _real_extract(self, url):
1369         mobj = re.match(self._VALID_URL, url)
1370
1371         video_id = mobj.group('id')
1372         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1373         video_extension = 'mp4'
1374         webpage = self._download_webpage(mrss_url, video_id)
1375
1376         self.report_extraction(video_id)
1377
1378         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
1379             webpage, u'video URL')
1380
1381         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1382             webpage, u'title')
1383
1384         return [{
1385             'id':       video_id,
1386             'url':      video_url,
1387             'ext':      video_extension,
1388             'title':    video_title,
1389         }]
1390
1391 class HowcastIE(InfoExtractor):
1392     """Information Extractor for Howcast.com"""
1393     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1394
1395     def _real_extract(self, url):
1396         mobj = re.match(self._VALID_URL, url)
1397
1398         video_id = mobj.group('id')
1399         webpage_url = 'http://www.howcast.com/videos/' + video_id
1400         webpage = self._download_webpage(webpage_url, video_id)
1401
1402         self.report_extraction(video_id)
1403
1404         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1405             webpage, u'video URL')
1406
1407         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1408             webpage, u'title')
1409
1410         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1411             webpage, u'description', fatal=False)
1412
1413         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1414             webpage, u'thumbnail', fatal=False)
1415
1416         return [{
1417             'id':       video_id,
1418             'url':      video_url,
1419             'ext':      'mp4',
1420             'title':    video_title,
1421             'description': video_description,
1422             'thumbnail': thumbnail,
1423         }]
1424
1425 class VineIE(InfoExtractor):
1426     """Information Extractor for Vine.co"""
1427     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1428
1429     def _real_extract(self, url):
1430         mobj = re.match(self._VALID_URL, url)
1431
1432         video_id = mobj.group('id')
1433         webpage_url = 'https://vine.co/v/' + video_id
1434         webpage = self._download_webpage(webpage_url, video_id)
1435
1436         self.report_extraction(video_id)
1437
1438         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1439             webpage, u'video URL')
1440
1441         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1442             webpage, u'title')
1443
1444         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1445             webpage, u'thumbnail', fatal=False)
1446
1447         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1448             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1449
1450         return [{
1451             'id':        video_id,
1452             'url':       video_url,
1453             'ext':       'mp4',
1454             'title':     video_title,
1455             'thumbnail': thumbnail,
1456             'uploader':  uploader,
1457         }]
1458
1459 class FlickrIE(InfoExtractor):
1460     """Information Extractor for Flickr videos"""
1461     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1462
1463     def _real_extract(self, url):
1464         mobj = re.match(self._VALID_URL, url)
1465
1466         video_id = mobj.group('id')
1467         video_uploader_id = mobj.group('uploader_id')
1468         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1469         webpage = self._download_webpage(webpage_url, video_id)
1470
1471         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1472
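        # The photo secret is passed along to the video XML endpoints requested below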
1473         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1474         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1475
1476         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1477             first_xml, u'node_id')
1478
1479         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1480         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1481
1482         self.report_extraction(video_id)
1483
1484         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1485         if mobj is None:
1486             raise ExtractorError(u'Unable to extract video url')
1487         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1488
1489         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1490             webpage, u'video title')
1491
1492         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1493             webpage, u'description', fatal=False)
1494
1495         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1496             webpage, u'thumbnail', fatal=False)
1497
1498         return [{
1499             'id':          video_id,
1500             'url':         video_url,
1501             'ext':         'mp4',
1502             'title':       video_title,
1503             'description': video_description,
1504             'thumbnail':   thumbnail,
1505             'uploader_id': video_uploader_id,
1506         }]
1507
1508 class TeamcocoIE(InfoExtractor):
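    """Information extractor for teamcoco.com"""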
1509     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1510
1511     def _real_extract(self, url):
1512         mobj = re.match(self._VALID_URL, url)
1513         if mobj is None:
1514             raise ExtractorError(u'Invalid URL: %s' % url)
1515         url_title = mobj.group('url_title')
1516         webpage = self._download_webpage(url, url_title)
1517
1518         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1519             webpage, u'video id')
1520
1521         self.report_extraction(video_id)
1522
1523         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1524             webpage, u'title')
1525
1526         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1527             webpage, u'thumbnail', fatal=False)
1528
1529         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1530             webpage, u'description', fatal=False)
1531
1532         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1533         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1534
1535         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1536             data, u'video URL')
1537
1538         return [{
1539             'id':          video_id,
1540             'url':         video_url,
1541             'ext':         'mp4',
1542             'title':       video_title,
1543             'thumbnail':   thumbnail,
1544             'description': video_description,
1545         }]
1546
1547 class XHamsterIE(InfoExtractor):
1548     """Information Extractor for xHamster"""
1549     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1550
1551     def _real_extract(self, url):
1552         mobj = re.match(self._VALID_URL, url)
1553
1554         video_id = mobj.group('id')
1555         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1556         webpage = self._download_webpage(mrss_url, video_id)
1557
1558         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1559         if mobj is None:
1560             raise ExtractorError(u'Unable to extract media URL')
1561         if len(mobj.group('server')) == 0:
1562             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1563         else:
1564             video_url = mobj.group('server') + '/key=' + mobj.group('file')
1565         video_extension = video_url.split('.')[-1]
1566
1567         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1568             webpage, u'title')
1569
1570         # Can't see the description anywhere in the UI
1571         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1572         #     webpage, u'description', fatal=False)
1573         # if video_description: video_description = unescapeHTML(video_description)
1574
1575         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1576         if mobj:
1577             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1578         else:
1579             video_upload_date = None
1580             self._downloader.report_warning(u'Unable to extract upload date')
1581
1582         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1583             webpage, u'uploader id', default=u'anonymous')
1584
1585         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1586             webpage, u'thumbnail', fatal=False)
1587
1588         return [{
1589             'id':       video_id,
1590             'url':      video_url,
1591             'ext':      video_extension,
1592             'title':    video_title,
1593             # 'description': video_description,
1594             'upload_date': video_upload_date,
1595             'uploader_id': video_uploader_id,
1596             'thumbnail': video_thumbnail
1597         }]
1598
1599 class HypemIE(InfoExtractor):
1600     """Information Extractor for hypem"""
1601     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1602
1603     def _real_extract(self, url):
1604         mobj = re.match(self._VALID_URL, url)
1605         if mobj is None:
1606             raise ExtractorError(u'Invalid URL: %s' % url)
1607         track_id = mobj.group(1)
1608
1609         data = { 'ax': 1, 'ts': time.time() }
1610         data_encoded = compat_urllib_parse.urlencode(data)
1611         complete_url = url + "?" + data_encoded
1612         request = compat_urllib_request.Request(complete_url)
1613         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1614         cookie = urlh.headers.get('Set-Cookie', '')
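        # Keep the session cookie; it is sent back with the serve/source request below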
1615
1616         self.report_extraction(track_id)
1617
1618         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1619             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1620         try:
1621             track_list = json.loads(html_tracks)
1622             track = track_list[u'tracks'][0]
1623         except ValueError:
1624             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1625
1626         key = track[u"key"]
1627         track_id = track[u"id"]
1628         artist = track[u"artist"]
1629         title = track[u"song"]
1630
1631         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1632         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1633         request.add_header('cookie', cookie)
1634         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1635         try:
1636             song_data = json.loads(song_data_json)
1637         except ValueError:
1638             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1639         final_url = song_data[u"url"]
1640
1641         return [{
1642             'id':       track_id,
1643             'url':      final_url,
1644             'ext':      "mp3",
1645             'title':    title,
1646             'artist':   artist,
1647         }]
1648
1649 class Vbox7IE(InfoExtractor):
1650     """Information Extractor for Vbox7"""
1651     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1652
1653     def _real_extract(self, url):
1654         mobj = re.match(self._VALID_URL, url)
1655         if mobj is None:
1656             raise ExtractorError(u'Invalid URL: %s' % url)
1657         video_id = mobj.group(1)
1658
1659         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1660         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1661         redirect_url = urlh.geturl() + new_location
1662         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1663
1664         title = self._html_search_regex(r'<title>(.*)</title>',
1665             webpage, u'title').split('/')[0].strip()
1666
1667         ext = "flv"
1668         info_url = "http://vbox7.com/play/magare.do"
1669         data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
1670         info_request = compat_urllib_request.Request(info_url, data)
1671         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1672         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1673         if info_response is None:
1674             raise ExtractorError(u'Unable to extract the media url')
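        # info_response is a 'key=value&key=value' string; the two values are the media URL and the thumbnail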
1675         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1676
1677         return [{
1678             'id':        video_id,
1679             'url':       final_url,
1680             'ext':       ext,
1681             'title':     title,
1682             'thumbnail': thumbnail_url,
1683         }]
1684
1685
1686 def gen_extractors():
1687     """ Return a list containing an instance of every supported extractor.
1688     The order does matter; the first extractor matched is the one handling the URL.
1689     """
1690     return [
1691         YoutubePlaylistIE(),
1692         YoutubeChannelIE(),
1693         YoutubeUserIE(),
1694         YoutubeSearchIE(),
1695         YoutubeIE(),
1696         MetacafeIE(),
1697         DailymotionIE(),
1698         GoogleSearchIE(),
1699         PhotobucketIE(),
1700         YahooIE(),
1701         YahooSearchIE(),
1702         DepositFilesIE(),
1703         FacebookIE(),
1704         BlipTVIE(),
1705         BlipTVUserIE(),
1706         VimeoIE(),
1707         MyVideoIE(),
1708         ComedyCentralIE(),
1709         EscapistIE(),
1710         CollegeHumorIE(),
1711         XVideosIE(),
1712         SoundcloudSetIE(),
1713         SoundcloudIE(),
1714         InfoQIE(),
1715         MixcloudIE(),
1716         StanfordOpenClassroomIE(),
1717         MTVIE(),
1718         YoukuIE(),
1719         XNXXIE(),
1720         YouJizzIE(),
1721         PornotubeIE(),
1722         YouPornIE(),
1723         GooglePlusIE(),
1724         ArteTvIE(),
1725         NBAIE(),
1726         WorldStarHipHopIE(),
1727         JustinTVIE(),
1728         FunnyOrDieIE(),
1729         SteamIE(),
1730         UstreamIE(),
1731         RBMARadioIE(),
1732         EightTracksIE(),
1733         KeekIE(),
1734         TEDIE(),
1735         MySpassIE(),
1736         SpiegelIE(),
1737         LiveLeakIE(),
1738         ARDIE(),
1739         ZDFIE(),
1740         TumblrIE(),
1741         BandcampIE(),
1742         RedTubeIE(),
1743         InaIE(),
1744         HowcastIE(),
1745         VineIE(),
1746         FlickrIE(),
1747         TeamcocoIE(),
1748         XHamsterIE(),
1749         HypemIE(),
1750         Vbox7IE(),
1751         GametrailersIE(),
1752         StatigramIE(),
1753         GenericIE()
1754     ]
1755
1756 def get_info_extractor(ie_name):
1757     """Returns the info extractor class with the given ie_name"""
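    # e.g. get_info_extractor(u'Youtube') returns the YoutubeIE class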
1758     return globals()[ie_name+'IE']