Move FunnyOrDie into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.metacafe import MetacafeIE
37 from .extractor.mixcloud import MixcloudIE
38 from .extractor.mtv import MTVIE
39 from .extractor.myvideo import MyVideoIE
40 from .extractor.nba import NBAIE
41 from .extractor.statigram import StatigramIE
42 from .extractor.photobucket import PhotobucketIE
43 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
44 from .extractor.stanfordoc import StanfordOpenClassroomIE
45 from .extractor.steam import SteamIE
46 from .extractor.ted import TEDIE
47 from .extractor.vimeo import VimeoIE
48 from .extractor.worldstarhiphop import WorldStarHipHopIE
49 from .extractor.xnxx import XNXXIE
50 from .extractor.xvideos import XVideosIE
51 from .extractor.yahoo import YahooIE, YahooSearchIE
52 from .extractor.youku import YoukuIE
53 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
54 from .extractor.zdf import ZDFIE
55
56
57
58
59
60
61
62
63
64
65
66
67
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are accepted: a bare channel page, a /b/ archived
    # broadcast, and a /c/ chapter of a broadcast.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of results per API page when listing a channel archive.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one page of the archive-listing API and build info dicts.

        Returns a tuple (total items on the page, list of info dicts);
        clips without a 'video_file_url' are counted but not listed.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On failure the API answers with an error dict instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: results are paged, _JUSTIN_PAGE_LIMIT at a time.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter of a broadcast: resolve the enclosing archive and
            # download the whole file (start offsets unsupported, see TODO).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Select the <archive> element matching the page's archive id;
            # the for/else raises when no entry matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast; the API still answers with a list.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the channel archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
200
201
202
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The numeric recording id comes straight from the URL.
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The downloadable file lives on a fixed CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
234
235
class RBMARadioIE(InfoExtractor):
    """Information extractor for show recordings on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON blob assigned to window.gon.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s CBR stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
269
270
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of *formats* whose 'format' equals *req_format*,
        or None when no such format exists."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare `except:`; only a JSON decode error is expected.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # Fixed: the original concatenated the exception object to a str
            # (TypeError); format it so the actual missing key is reported.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>";
            # keep the first two parts as the format label, e.g. "480p-370k".
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Fixed: the original tested the undefined name `result`,
            # raising NameError instead of the intended error message.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
375
376
377
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL in the player setup is percent-encoded.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # The upload date is optional; normalize it when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
412
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The title comes from the landing page's <title> element.
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The playable stream sits on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
453
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (multi-track playlists)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a javascript assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a random session token and hands out
        # one track per request until 'at_last_track' is set.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
494
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN pattern.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
522
523
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fixed: previously fell back to the undefined name `ext`
            # (NameError); use the extension derived from the URL instead.
            format = extension
        else:
            format = format_id_el.text
        # <description> and <imagePreview> are optional in the metadata XML.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
577
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # A per-video XML manifest lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the manifest is the one we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
609
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix which we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
646
647
648
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is javascript-escaped (\x22 is a double quote).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
682
class BandcampIE(InfoExtractor):
    """Information extractor for free track downloads on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Only tracks that offer a free download can be extracted.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        # Renamed from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # The track dictionary is embedded in javascript on the download page.
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
728
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The page exposes a direct mp4 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
756         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against direct calls with a non-matching URL, consistent with
        # the other extractors in this file (e.g. RedTubeIE); previously this
        # raised a raw AttributeError on mobj.group().
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The public player exposes an MRSS document carrying the media metadata.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
783
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against direct calls with a non-matching URL, consistent with
        # the other extractors in this file (e.g. RedTubeIE); previously this
        # raised a raw AttributeError on mobj.group().
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonicalize the page URL so scheme/host variants fetch the same page.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
817
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against direct calls with a non-matching URL, consistent with
        # the other extractors in this file (e.g. RedTubeIE); previously this
        # raised a raw AttributeError on mobj.group().
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonicalize to the https page so all URL variants fetch the same document.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
851
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against direct calls with a non-matching URL, consistent with
        # the other extractors in this file (e.g. RedTubeIE); previously this
        # raised a raw AttributeError on mobj.group().
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-video secret is required by the two playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is split into an application prefix and an
        # HTML-escaped path; join them after unescaping.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
900
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in a data attribute of the video <article> tag.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')
        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document carries the actual media URLs; pick the
        # high-quality file entry.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
939
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot in 'www\.' is now escaped; it was previously a wildcard and
    # could match any character in that position.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against direct calls with a non-matching URL, consistent with
        # the other extractors in this file (e.g. RedTubeIE); previously this
        # raised a raw AttributeError on mobj.group().
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The media URL is embedded in a JS player config: 'srv' holds an
        # optional server prefix, 'file' the (possibly URL-encoded) path.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not exposed anywhere in the page UI, so none is
        # extracted.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Reassemble as YYYYMMDD, the format the downloader expects.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
991
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The serve endpoint below requires the session cookie from this response.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # A well-formed document missing the 'tracks' list (KeyError) or
            # with an empty one (IndexError) is just as invalid as malformed
            # JSON; previously those escaped as raw exceptions.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1041
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via a JavaScript location assignment, which
        # urllib will not follow; resolve it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST the video id to the info endpoint; the response is a
        # querystring-style "k=v&k=v" payload with the media and thumbnail URLs.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1077
1078
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this sequence ordered: more specific extractors (playlists,
    # channels, searches) must come before the more general ones, with
    # GenericIE as the catch-all at the end.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
1148
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes all follow the '<Name>IE' naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    return globals()['%sIE' % ie_name]