Move TED IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mtv import MTVIE
37 from .extractor.myvideo import MyVideoIE
38 from .extractor.nba import NBAIE
39 from .extractor.statigram import StatigramIE
40 from .extractor.photobucket import PhotobucketIE
41 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
42 from .extractor.stanfordoc import StanfordOpenClassroomIE
43 from .extractor.ted import TEDIE
44 from .extractor.vimeo import VimeoIE
45 from .extractor.xvideos import XVideosIE
46 from .extractor.yahoo import YahooIE, YahooSearchIE
47 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
48 from .extractor.zdf import ZDFIE
49
50
51
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the list of candidate URLs for fmt/bitrate from the
        'audio_formats' JSON section.

        If the format entry has no per-bitrate sub-dict (i.e. it is a
        plain URL list), the bitrate argument is ignored.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in url_list that answers, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: the previous .decode('utf-8') calls were dropped -- the
        # regex groups are already text strings, and decoding them again
        # raised AttributeError on Python 3 (and double-decoded on py2).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe the formats in dict order, keep the first live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
156
157
158
159
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments; each segment becomes one
    entry in the returned list.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Build a pseudo-unique session id (millisecond timestamp plus
        # two random components) for the download URLs below.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character pool below with a small
        # linear-congruential generator driven by the server-supplied
        # seed.  Statement order matters: each chosen character is
        # removed from the pool before the next pick, so the pool
        # shrinks as the loop runs.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The obfuscated fileId is a '*'-separated list of indexes into
        # the seed-shuffled pool; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto a Youku stream type;
            # anything other than best/worst falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
252
253
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        # Fetch the page once; every field below is scraped from it.
        webpage = self._download_webpage(url, video_id)

        # The flash player receives the percent-encoded media URL as a
        # query argument, so unquote it after extraction.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
292
293
294
295
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are handled: a whole channel, a single archived
    # broadcast (/b/<id>), and a chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of items the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict with an 'error' key.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a file URL are skipped (hence "*valid*").
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: the archive listing is paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a sub-range of one archived broadcast; it is
            # resolved to its parent archive and returned immediately.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the page's archive
            # id; the for/else raises if no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast: one request through the same
            # paged API, but pagination stops after the first page.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
428
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = m.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
457
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (which compiles without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Age-gated store pages are re-requested through the agecheck
        # URL, which carries a fake birth date in its query string.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        url_matches = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            webpage)
        title_matches = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_matches = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        # The three regexes walk the page in document order, so zipping
        # their iterators pairs each movie entry with its title/thumbnail.
        for url_m, title_m, thumb_m in zip(url_matches, title_matches, thumb_matches):
            vid_id = url_m.group('videoID')
            vid_url = url_m.group('videoURL')
            if not vid_url:
                raise ExtractorError(u'Cannot find video url for %s' % vid_id)
            videos.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
512
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV can be fetched straight from the CDN once the id is known;
        # the page itself is only scraped for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: returns a bare info dict (not a list), as the original did.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
544
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'so\.addVariable\("file","(.*?)"\)', page, u'video URL')

        # The site serves either mp4 or flv; infer from the URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_title = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_title is not None:
                video_title = candy_title.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
584
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JSON blob assigned to window.gon.
        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN; the extension comes
        # from the URL path.
        video_url = show['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
618
619
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit; only a parse error is expected.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # BUGFIX: the original concatenated the exception object to a
            # str ('...' + sys.exc_info()[1]), which itself raised TypeError.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; the first
            # two pieces form the format label, e.g. "480p-370k".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the undefined name "result",
            # raising NameError whenever a requested format was missing.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
724
725
726
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('videoid')
        video_title = m.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flash player config embeds the (percent-encoded) FLV location.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is rendered as e.g. "Added 12/31/2011 by ...".
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
761
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The public watch page only carries the title; the media URL lives
        # on a separate embed page referenced from it.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        m_embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if m_embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = m_embed.group(0).strip()
        # The embed page uses a numeric id; prefer it from here on.
        video_id = m_embed.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
802
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of tracks)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The page embeds the mix metadata as a JS assignment (PAGE.mix = {...};).
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix one track at a time: each API response yields one track
        # plus an at_last_track flag; the next request must carry the previous
        # track id, so the loop is inherently sequential.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
843
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme, so only the
        # title/uploader need to be scraped from the page.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{'id': video_id,
                 'url': video_url,
                 'ext': 'mp4',
                 'title': video_title,
                 'thumbnail': thumbnail,
                 'uploader': uploader}]
871
872
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the last (or second-to-last, when a trailing
    slash is present) URL path element and resolved through the site's XML
    metadata service, which yields the flv URL, title and optional
    format/description/thumbnail fields.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name `ext`,
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension computed above instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        # <description> and <imagePreview> are optional in the metadata XML.
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
926
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # A per-video XML manifest lists the available encodings; the last
        # entry in the document is the one selected here.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
958
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct media URL from the embedded player config.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix which is stripped off here.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
995
996
997
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr-hosted videos."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video source appears in escaped (\x22-quoted) inline markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
1031
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks.

    Only tracks exposing a free-download page can be extracted; the final
    mp3 URL is obtained by replaying the request that Bandcamp's own
    download_bundle_*.js script performs.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed the local `id` to `video_id` so the builtin id() is
        # no longer shadowed (behavior unchanged).
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
1077
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both the media URL and the title come straight from the page markup.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS notice document carries both the mp4 URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
1132
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical watch page for the id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
1166
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical watch page; all metadata lives in meta tags.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1200
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        # Step 1: the photo page, which embeds the per-video secret.
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 2: the first XML document resolves (id, secret) to a node_id.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 3: the playlist XML for that node carries the stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is APP + FULLPATH; FULLPATH is HTML-escaped in the XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # Remaining metadata is scraped from the original photo page.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1249
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id needed for the CVP data feed is embedded in the page.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A second request against the CVP XML feed yields the media URL.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1288
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config names a server and a file; with no server the
        # file field is itself a percent-encoded absolute URL.
        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = m_media.group('server')
        if len(server) == 0:
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = server + '/key=' + m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            # Concatenate to YYYYMMDD.
            video_upload_date = m_date.group('upload_date_Y') + m_date.group('upload_date_m') + m_date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1340
class HypemIE(InfoExtractor):
    """Information extractor for hypem (hypem.com track pages)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Request the page with ax/ts query params; the Set-Cookie from this
        # response must be replayed on the later /serve/source request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds a JSON track list inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request: resolve (track_id, key) to the actual media URL,
        # carrying the cookie captured above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1390
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: URL serves a JS redirect; the new location is relative,
        # so it is appended to the URL the handle actually ended up at.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Title is the part of <title> before the first '/'.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        # POST to magare.do returns '<key>=<url>&<key>=<thumb>'-style pairs;
        # the values are split out positionally below.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1426
1427
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordering: more specific extractors must precede generic ones,
    # and GenericIE must stay last as the catch-all.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
1497
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level.
    return globals()['{0}IE'.format(ie_name)]