Move MTV IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mtv import MTVIE
37 from .extractor.myvideo import MyVideoIE
38 from .extractor.nba import NBAIE
39 from .extractor.statigram import StatigramIE
40 from .extractor.photobucket import PhotobucketIE
41 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
42 from .extractor.stanfordoc import StanfordOpenClassroomIE
43 from .extractor.vimeo import VimeoIE
44 from .extractor.xvideos import XVideosIE
45 from .extractor.yahoo import YahooIE, YahooSearchIE
46 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
47 from .extractor.zdf import ZDFIE
48
49
50
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast JSON description and returns the first working
    audio URL, honouring the requested format/bitrate when given.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format carries per-bitrate URL lists, pick the requested
        bitrate (or the highest one for 'best'/unknown); a flat list means
        there is no bitrate info and it is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: regex groups are already text; the old .decode('utf-8')
        # calls raised AttributeError on Python 3 (str has no decode) and
        # only worked for pure-ASCII names on Python 2.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Scan all formats until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUGFIX: values below were all text already; the redundant
        # .decode('utf-8') calls have been dropped (see above).
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
155
156
157
158
class YoukuIE(InfoExtractor):
    # Extractor for v.youku.com. A video is served as numbered segments;
    # each segment becomes one entry in the returned list.
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: millisecond timestamp
        followed by two random integers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode stream file ids.

        Implements Youku's linear-congruential shuffle; the constants and
        the pick-and-remove order are part of the protocol and must not
        be changed.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id by
        looking each index up in the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the segment file ids and return
        one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map youtube-dl format requests onto Youku stream names;
            # anything other than best/worst falls back to the flv stream.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number, as two hex digits, into the fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
251
252
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract a single video from its watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the flash variables.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
                                             u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
291
292
293
294
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel archive; also the API's
    # maximum clips-per-request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON page of clips and build an info dict for each
        clip that actually has a video file URL.

        Returns (total clips in the response, list of info dicts)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with an object carrying 'error'.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged archive walk),
        single chapter (XML + kraken metadata), or one archived broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel URL: page through the whole archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of its parent archive.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the embedded archive id;
            # the for/else raises when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
427
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract a funnyordie.com video from its watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The usable stream is the second <source> inside the <video> tag.
        media_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        video_title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': video_title,
            'description': description,
        }]
456
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the flag must be
        # supplied explicitly here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist with every movie listed on a game's video page."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Some store pages hide their content behind an age gate.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # URLs, titles and thumbnails appear in parallel document order,
        # so the three iterators are consumed in lockstep.
        url_iter = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            webpage)
        title_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for url_m, title_m, thumb_m in zip(url_iter, title_iter, thumb_iter):
            video_id = url_m.group('videoID')
            video_url = url_m.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
511
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv lives at a predictable CDN location derived from the id.
        media_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
                                        webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': media_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
543
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract a WorldStarHipHop (or WSHH candy) video."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
                                       page, u'video URL')

        # The container is not announced anywhere; guess it from the URL.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
                                              page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
                                            page, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
583
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an RBMA Radio show from the JSON embedded in its page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                      webpage, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant from the Akamai stream.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_ext = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': stream_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
617
618
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age check with a pre-verified cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:  # narrowed from a bare except: that hid real errors
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # BUGFIX: the original concatenated the exception object itself
            # to a str (TypeError), masking the real missing-key error.
            raise ExtractorError(u'Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>p_<bitrate>k_<id>";
            # its first two pieces become the format label, e.g. "480p-370k".
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the undefined name 'result' here,
            # so every specific format request crashed with NameError.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
723
724
725
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract a pornotube.com video; the title comes from the URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits percent-encoded in the player configuration.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
                               webpage, u'video url'))

        # The upload date is optional on the page.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
760
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract a youjizz.com video by following its embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
                                              webpage, u'title').strip()

        # The actual stream URL only appears on the embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
801
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play API track by track and return all songs of a mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number+1), track_count),
                errnote=u'Failed to download song information')
            track_number += 1
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API tells us when the mix is exhausted.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
842
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract one keek; media/thumbnail URLs derive from the id alone."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
870
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk markers and talk titles/links are matched in lockstep below.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this IE as a url_result entry.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebound here; the parsed talkDetails dict is only
        # used for its 'id' and the last entry of 'htmlStreams'.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
945
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract one video via the site's XML metadata endpoint.

        Raises ExtractorError if the metadata lacks a download URL or title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch used to read `format = ext`, but no `ext`
            # name exists in this scope, so a missing <format_id> raised
            # NameError. Fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # <description> and <imagePreview> are optional in the metadata.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
999
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract one video; format data comes from a per-video XML file."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # A companion XML document lists the formats; the last entry is used.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1031
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    # BUG FIX: the scheme part used to be r'^(?:http?://)?', which matches
    # "htt://" and "http://" but NOT "https://" (the '?' only made the final
    # 'p' optional). r'https?://' accepts both http and https links.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the mp4 stream and basic metadata from a view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config assigns the direct media URL to `file:`.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a "LiveLeak.com -" prefix that we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
1068
1069
1070
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video embedded in a Tumblr post."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embed markup is escaped (\x22 quotes) inside a script block.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
1104
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a free track; raises ExtractorError if none is offered."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: this value used to be bound to `id`, shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
1150
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source URL and title for one video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source is exposed directly in a <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1178         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract one video via the player's MRSS feed."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS notice for the id carries both title and mp4 URL.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1205
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract one video from its canonical page built from the id."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 URL is assigned to `file` in the player config.
        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1239
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract one vine from its canonical https page."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # The direct stream URL is published via twitter player metadata.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1273
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr video through its two-step playlist API."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret is required by both API calls below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 1: resolve the node id for this photo/secret pair.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 2: fetch the playlist that carries the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1322
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract one video; the media list comes from a per-id XML file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only present inside the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A companion XML document lists the files; take the "high" variant.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1361
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract one video from an xhamster.com movie page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Re-request a canonical URL built from the id (the title slug is ignored).
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries a 'srv' host and a 'file' path.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server part: 'file' already holds a percent-encoded absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is only available as a "hint" attribute (YYYY-MM-DD ...).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1413
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract one track from a hypem.com track page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The 'ax'/'ts' query parameters mimic the site's own AJAX request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie is replayed on the serve request further down.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            # Only the first track of the embedded list is extracted.
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # NOTE: track_id is rebound here to the id from the JSON payload.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # A second, cookie-authenticated request resolves the streamable URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1463
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract one video, following the site's javascript redirect."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via javascript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is an urlencoded pair
        # carrying the final media URL and the thumbnail URL.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1499
1500
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in an ordered list and instantiate them in one pass;
    # more specific extractors must precede the generic ones.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
1570
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]