Move Steam IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.steam import SteamIE
45 from .extractor.ted import TEDIE
46 from .extractor.vimeo import VimeoIE
47 from .extractor.xnxx import XNXXIE
48 from .extractor.xvideos import XVideosIE
49 from .extractor.yahoo import YahooIE, YahooSearchIE
50 from .extractor.youku import YoukuIE
51 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
52 from .extractor.zdf import ZDFIE
53
54
55
56
57
58
59
60
61
62
63
64
65
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a channel page, an archived broadcast (/b/<id>) or a
    # chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when fetching a channel's archive list from the API.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and build info dicts for its clips.

        Returns a tuple (total items in the API response, list of info
        dicts for clips that actually carry a video file URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On failure the API answers with an error dict instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Skip clips without a downloadable file.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with an ISO date; drop the dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL kind (channel / broadcast / chapter) and extract."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # A channel's archive list can span many API pages.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive entry matching the chapter's archive id;
            # the for/else raises when no entry matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Fetch pages until a short page signals the end
        # (a single iteration when not paged).
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
198
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The media URL sits in the second <source> element of the player.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline, fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
227
228
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV lives at a predictable CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
260
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'so\.addVariable\("file","(.*?)"\)', webpage_src, u'video URL')

        # Derive the container from the media URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(
            r"<title>(.*)</title>", webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(
            r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail',
            fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
300
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment on the "gon" object.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant from the Akamai edge.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
334
335
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # An age-verification cookie is required to see the video page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        # FIX: catch only JSON decoding errors instead of a bare except,
        # which also swallowed KeyboardInterrupt/SystemExit.
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # FIX: str() the exception; concatenating str + exception object
            # raised TypeError, masking the real missing-parameter message.
            raise ExtractorError('Missing JSON parameter: ' + str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515" -> format "480p-370k".
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # FIX: the original tested the undefined name 'result' here,
            # raising NameError instead of the intended error message.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
440
441
442
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flash player is handed a percent-encoded FLV URL.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # The upload date is optional; normalize it when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
477
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The media URL is only present on the dedicated embed page.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
518
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id keeps the play API happy across requests.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track of the set.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
559
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both media and thumbnail live at predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
587
588
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # FIX: the original fell back to the undefined name 'ext' here,
            # raising NameError; use the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
642
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Per-video format data is served as a separate XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element describes the variant we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
674
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding that LiveLeak prepends to og:title.
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
711
712
713
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Canonicalize to the post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is escaped (\x22 quotes) inside a script block.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video_match.group('video_url')
        ext = video_match.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
747
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')

        # The track dictionary is embedded in some javascript code
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)

        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
793
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source and the title are both embedded in the page markup.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
821         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The mp4 url and the title are served through an MRSS feed,
        # not through the html page itself.
        feed_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(feed_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
848
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Re-build a canonical http URL so https/alternate forms land
        # on the same page.
        canonical_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(canonical_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
882
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch over https with the canonical path.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # All metadata is exposed through meta tags in the page head.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>', page, u'uploader',
            fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
916
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(page_url, video_id)

        # The per-photo secret is needed by both playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final stream url is split across two attributes of the
        # STREAM element; FULLPATH is html-escaped.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
965
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id only appears in the page markup, not in the url.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The stream url itself lives in a separate XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1004
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = media.group('server')
        if server:
            # Server-relative form: key is appended to the server url.
            video_url = server + '/key=' + media.group('file')
        else:
            # Absolute (url-encoded) form.
            video_url = compat_urllib_parse.unquote(media.group('file'))
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, u'title')

        # The description is not exposed anywhere in the page UI.

        date_match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_match is None:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            # YYYYMMDD, assembled from the three date components.
            video_upload_date = ''.join(date_match.group(
                'upload_date_Y', 'upload_date_m', 'upload_date_d'))

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')
        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1056
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (Hype Machine tracks).

    Extraction is a two-step flow: the track page sets a session cookie
    that must be echoed back when requesting the serve/source endpoint,
    which in turn returns the final mp3 url.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters mimic the site's own ajax request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Keep the session cookie: the serve request below requires it.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON in a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # NOTE: track_id is rebound here to the site's internal id,
        # which replaces the id parsed from the url above.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Ask the serve endpoint for the actual media url, replaying the
        # session cookie captured from the first request.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1106
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via an inline javascript assignment;
        # follow the redirect manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(
            urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # Query the player endpoint (form-encoded POST) for the media
        # and thumbnail urls.
        form_data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=...&thumb=..."; keep only the values.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1142
1143
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Most specific extractors first; GenericIE is the catch-all and must
    # stay last. Do not reorder without a reason.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
1213
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in the module namespace.
    return globals()['%sIE' % ie_name]