# youtube-dl: youtube_dl/InfoExtractors.py (commit 6a04735fa0a94ce27f27b540e12b84c10088daa2)
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.steam import SteamIE
45 from .extractor.ted import TEDIE
46 from .extractor.vimeo import VimeoIE
47 from .extractor.worldstarhiphop import WorldStarHipHopIE
48 from .extractor.xnxx import XNXXIE
49 from .extractor.xvideos import XVideosIE
50 from .extractor.yahoo import YahooIE, YahooSearchIE
51 from .extractor.youku import YoukuIE
52 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
53 from .extractor.zdf import ZDFIE
54
55
56
57
58
59
60
61
62
63
64
65
66
67 class JustinTVIE(InfoExtractor):
68     """Information extractor for justin.tv and twitch.tv"""
69     # TODO: One broadcast may be split into multiple videos. The key
70     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
71     # starts at 1 and increases. Can we treat all parts as one video?
72
73     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
74         (?:
75             (?P<channelid>[^/]+)|
76             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
77             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
78         )
79         /?(?:\#.*)?$
80         """
81     _JUSTIN_PAGE_LIMIT = 100
82     IE_NAME = u'justin.tv'
83
84     def report_download_page(self, channel, offset):
85         """Report attempt to download a single page of videos."""
86         self.to_screen(u'%s: Downloading video information from %d to %d' %
87                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
88
89     # Return count of items, list of *valid* items
90     def _parse_page(self, url, video_id):
91         webpage = self._download_webpage(url, video_id,
92                                          u'Downloading video info JSON',
93                                          u'unable to download video info JSON')
94
95         response = json.loads(webpage)
96         if not isinstance(response, list):
97             error_text = response.get('error', 'unknown error')
98             raise ExtractorError(u'Justin.tv API: %s' % error_text)
99         info = []
100         for clip in response:
101             video_url = clip['video_file_url']
102             if video_url:
103                 video_extension = os.path.splitext(video_url)[1][1:]
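                # clip['start_time'] looks like 'YYYY-MM-DD ...'; dropping the dashes yields upload_date in YYYYMMDD form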
104                 video_date = re.sub('-', '', clip['start_time'][:10])
105                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
106                 video_id = clip['id']
107                 video_title = clip.get('title', video_id)
108                 info.append({
109                     'id': video_id,
110                     'url': video_url,
111                     'title': video_title,
112                     'uploader': clip.get('channel_name', video_uploader_id),
113                     'uploader_id': video_uploader_id,
114                     'upload_date': video_date,
115                     'ext': video_extension,
116                 })
117         return (len(response), info)
118
119     def _real_extract(self, url):
120         mobj = re.match(self._VALID_URL, url)
121         if mobj is None:
122             raise ExtractorError(u'invalid URL: %s' % url)
123
124         api_base = 'http://api.justin.tv'
125         paged = False
126         if mobj.group('channelid'):
127             paged = True
128             video_id = mobj.group('channelid')
129             api = api_base + '/channel/archives/%s.json' % video_id
130         elif mobj.group('chapterid'):
131             chapter_id = mobj.group('chapterid')
132
133             webpage = self._download_webpage(url, chapter_id)
134             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
135             if not m:
136                 raise ExtractorError(u'Cannot find archive of a chapter')
137             archive_id = m.group(1)
138
139             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
140             chapter_info_xml = self._download_webpage(api, chapter_id,
141                                              note=u'Downloading chapter information',
142                                              errnote=u'Chapter information download failed')
143             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
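            # Locate the <archive> entry whose <id> matches the archive_id found on the page; the for/else raises if none does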
144             for a in doc.findall('.//archive'):
145                 if archive_id == a.find('./id').text:
146                     break
147             else:
148                 raise ExtractorError(u'Could not find chapter in chapter information')
149
150             video_url = a.find('./video_file_url').text
151             video_ext = video_url.rpartition('.')[2] or u'flv'
152
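            # The justin.tv XML above supplies the raw video file; title, description and uploader come from the newer Twitch "kraken" API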
153             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
154             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
155                                    note='Downloading chapter metadata',
156                                    errnote='Download of chapter metadata failed')
157             chapter_info = json.loads(chapter_info_json)
158
159             bracket_start = int(doc.find('.//bracket_start').text)
160             bracket_end = int(doc.find('.//bracket_end').text)
161
162             # TODO determine start (and probably fix up file)
163             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
164             #video_url += u'?start=' + TODO:start_timestamp
165             # bracket_start is 13290, but we want 51670615
166             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
167                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
168
169             info = {
170                 'id': u'c' + chapter_id,
171                 'url': video_url,
172                 'ext': video_ext,
173                 'title': chapter_info['title'],
174                 'thumbnail': chapter_info['preview'],
175                 'description': chapter_info['description'],
176                 'uploader': chapter_info['channel']['display_name'],
177                 'uploader_id': chapter_info['channel']['name'],
178             }
179             return [info]
180         else:
181             video_id = mobj.group('videoid')
182             api = api_base + '/broadcast/by_archive/%s.json' % video_id
183
184         self.report_extraction(video_id)
185
186         info = []
187         offset = 0
188         limit = self._JUSTIN_PAGE_LIMIT
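        # The archive API is paged: keep fetching pages of 'limit' clips until a short page signals the end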
189         while True:
190             if paged:
191                 self.report_download_page(video_id, offset)
192             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
193             page_count, page_info = self._parse_page(page_url, video_id)
194             info.extend(page_info)
195             if not paged or page_count != limit:
196                 break
197             offset += limit
198         return info
199
200 class FunnyOrDieIE(InfoExtractor):
201     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
202
203     def _real_extract(self, url):
204         mobj = re.match(self._VALID_URL, url)
205         if mobj is None:
206             raise ExtractorError(u'invalid URL: %s' % url)
207
208         video_id = mobj.group('id')
209         webpage = self._download_webpage(url, video_id)
210
211         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
212             webpage, u'video URL', flags=re.DOTALL)
213
214         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
215             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
216
217         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
218             webpage, u'description', fatal=False, flags=re.DOTALL)
219
220         info = {
221             'id': video_id,
222             'url': video_url,
223             'ext': 'mp4',
224             'title': title,
225             'description': video_description,
226         }
227         return [info]
228
229
230 class UstreamIE(InfoExtractor):
231     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
232     IE_NAME = u'ustream'
233
234     def _real_extract(self, url):
235         m = re.match(self._VALID_URL, url)
236         video_id = m.group('videoID')
237
238         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
239         webpage = self._download_webpage(url, video_id)
240
241         self.report_extraction(video_id)
242
243         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
244             webpage, u'title')
245
246         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
247             webpage, u'uploader', fatal=False, flags=re.DOTALL)
248
249         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
250             webpage, u'thumbnail', fatal=False)
251
252         info = {
253                 'id': video_id,
254                 'url': video_url,
255                 'ext': 'flv',
256                 'title': video_title,
257                 'uploader': uploader,
258                 'thumbnail': thumbnail,
259                }
260         return info
261
262
263 class RBMARadioIE(InfoExtractor):
264     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
265
266     def _real_extract(self, url):
267         m = re.match(self._VALID_URL, url)
268         video_id = m.group('videoID')
269
270         webpage = self._download_webpage(url, video_id)
271
272         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
273             webpage, u'json data', flags=re.MULTILINE)
274
275         try:
276             data = json.loads(json_data)
277         except ValueError as e:
278             raise ExtractorError(u'Invalid JSON: ' + str(e))
279
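        # Append cbr=256 to the Akamai URL (apparently selecting the 256 kbps stream)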
280         video_url = data['akamai_url'] + '&cbr=256'
281         url_parts = compat_urllib_parse_urlparse(video_url)
282         video_ext = url_parts.path.rpartition('.')[2]
283         info = {
284                 'id': video_id,
285                 'url': video_url,
286                 'ext': video_ext,
287                 'title': data['title'],
288                 'description': data.get('teaser_text'),
289                 'location': data.get('country_of_origin'),
290                 'uploader': data.get('host', {}).get('name'),
291                 'uploader_id': data.get('host', {}).get('slug'),
292                 'thumbnail': data.get('image', {}).get('large_url_2x'),
293                 'duration': data.get('duration'),
294         }
295         return [info]
296
297
298 class YouPornIE(InfoExtractor):
299     """Information extractor for youporn.com."""
300     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
301
302     def _print_formats(self, formats):
303         """Print all available formats"""
304         print(u'Available formats:')
305         print(u'ext\t\tformat')
306         print(u'---------------------------------')
307         for format in formats:
308             print(u'%s\t\t%s'  % (format['ext'], format['format']))
309
310     def _specific(self, req_format, formats):
311         for x in formats:
312             if x['format'] == req_format:
313                 return x
314         return None
315
316     def _real_extract(self, url):
317         mobj = re.match(self._VALID_URL, url)
318         if mobj is None:
319             raise ExtractorError(u'Invalid URL: %s' % url)
320         video_id = mobj.group('videoid')
321
322         req = compat_urllib_request.Request(url)
323         req.add_header('Cookie', 'age_verified=1')
324         webpage = self._download_webpage(req, video_id)
325
326         # Get JSON parameters
327         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
328         try:
329             params = json.loads(json_params)
330         except ValueError as e:
331             raise ExtractorError(u'Invalid JSON: ' + str(e))
332
333         self.report_extraction(video_id)
334         try:
335             video_title = params['title']
336             upload_date = unified_strdate(params['release_date_f'])
337             video_description = params['description']
338             video_uploader = params['submitted_by']
339             thumbnail = params['thumbnails'][0]['image']
340         except KeyError as e:
341             raise ExtractorError(u'Missing JSON parameter: %s' % e)
342
343         # Get all of the formats available
344         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
345         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
346             webpage, u'download list').strip()
347
348         # Get all of the links from the page
349         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
350         links = re.findall(LINK_RE, download_list_html)
351         if not links:
352             raise ExtractorError(u'No known formats available for video')
353
354         self.to_screen(u'Links found: %d' % len(links))
355
356         formats = []
357         for link in links:
358
359             # A link looks like this:
360             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
361             # A path looks like this:
362             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
363             video_url = unescapeHTML(link)
364             path = compat_urllib_parse_urlparse(video_url).path
365             extension = os.path.splitext(path)[1][1:]
366             format = path.split('/')[4].split('_')[:2]
367             size = format[0]
368             bitrate = format[1]
369             format = "-".join(format)
370             # title = u'%s-%s-%s' % (video_title, size, bitrate)
371
372             formats.append({
373                 'id': video_id,
374                 'url': video_url,
375                 'uploader': video_uploader,
376                 'upload_date': upload_date,
377                 'title': video_title,
378                 'ext': extension,
379                 'format': format,
380                 'thumbnail': thumbnail,
381                 'description': video_description
382             })
383
384         if self._downloader.params.get('listformats', None):
385             self._print_formats(formats)
386             return
387
388         req_format = self._downloader.params.get('format', None)
389         self.to_screen(u'Format: %s' % req_format)
390
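        # Formats are kept in page order, so 'best' and 'worst' simply take the first and last entries of the download list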
391         if req_format is None or req_format == 'best':
392             return [formats[0]]
393         elif req_format == 'worst':
394             return [formats[-1]]
395         elif req_format in ('-1', 'all'):
396             return formats
397         else:
398             format = self._specific(req_format, formats)
399             if format is None:
400                 raise ExtractorError(u'Requested format not available')
401             return [format]
402
403
404
405 class PornotubeIE(InfoExtractor):
406     """Information extractor for pornotube.com."""
407     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
408
409     def _real_extract(self, url):
410         mobj = re.match(self._VALID_URL, url)
411         if mobj is None:
412             raise ExtractorError(u'Invalid URL: %s' % url)
413
414         video_id = mobj.group('videoid')
415         video_title = mobj.group('title')
416
417         # Get webpage content
418         webpage = self._download_webpage(url, video_id)
419
420         # Get the video URL
421         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
422         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
423         video_url = compat_urllib_parse.unquote(video_url)
424
425         # Get the upload date
426         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
427         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
428         if upload_date: upload_date = unified_strdate(upload_date)
429
430         info = {'id': video_id,
431                 'url': video_url,
432                 'uploader': None,
433                 'upload_date': upload_date,
434                 'title': video_title,
435                 'ext': 'flv',
436                 'format': 'flv'}
437
438         return [info]
439
440 class YouJizzIE(InfoExtractor):
441     """Information extractor for youjizz.com."""
442     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
443
444     def _real_extract(self, url):
445         mobj = re.match(self._VALID_URL, url)
446         if mobj is None:
447             raise ExtractorError(u'Invalid URL: %s' % url)
448
449         video_id = mobj.group('videoid')
450
451         # Get webpage content
452         webpage = self._download_webpage(url, video_id)
453
454         # Get the video title
455         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
456             webpage, u'title').strip()
457
458         # Get the embed page
459         result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
460         if result is None:
461             raise ExtractorError(u'Unable to extract embed page')
462
463         embed_page_url = result.group(0).strip()
464         video_id = result.group('videoid')
465
466         webpage = self._download_webpage(embed_page_url, video_id)
467
468         # Get the video URL
469         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
470             webpage, u'video URL')
471
472         info = {'id': video_id,
473                 'url': video_url,
474                 'title': video_title,
475                 'ext': 'flv',
476                 'format': 'flv',
477                 'player_url': embed_page_url}
478
479         return [info]
480
481 class EightTracksIE(InfoExtractor):
482     IE_NAME = '8tracks'
483     _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
484
485     def _real_extract(self, url):
486         mobj = re.match(self._VALID_URL, url)
487         if mobj is None:
488             raise ExtractorError(u'Invalid URL: %s' % url)
489         playlist_id = mobj.group('id')
490
491         webpage = self._download_webpage(url, playlist_id)
492
493         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
494         data = json.loads(json_like)
495
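        # 8tracks streams one track at a time: open a play session under a random client-side id, then keep calling the "next" endpoint until at_last_track is set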
496         session = str(random.randint(0, 1000000000))
497         mix_id = data['id']
498         track_count = data['tracks_count']
499         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
500         next_url = first_url
501         res = []
502         for i in itertools.count():
503             api_json = self._download_webpage(next_url, playlist_id,
504                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
505                 errnote=u'Failed to download song information')
506             api_data = json.loads(api_json)
507             track_data = api_data[u'set']['track']
508             info = {
509                 'id': track_data['id'],
510                 'url': track_data['track_file_stream_url'],
511                 'title': track_data['performer'] + u' - ' + track_data['name'],
512                 'raw_title': track_data['name'],
513                 'uploader_id': data['user']['login'],
514                 'ext': 'm4a',
515             }
516             res.append(info)
517             if api_data['set']['at_last_track']:
518                 break
519             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
520         return res
521
522 class KeekIE(InfoExtractor):
523     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
524     IE_NAME = u'keek'
525
526     def _real_extract(self, url):
527         m = re.match(self._VALID_URL, url)
528         video_id = m.group('videoID')
529
530         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
531         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
532         webpage = self._download_webpage(url, video_id)
533
534         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
535             webpage, u'title')
536
537         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
538             webpage, u'uploader', fatal=False)
539
540         info = {
541                 'id': video_id,
542                 'url': video_url,
543                 'ext': 'mp4',
544                 'title': video_title,
545                 'thumbnail': thumbnail,
546                 'uploader': uploader
547         }
548         return [info]
549
550
551 class MySpassIE(InfoExtractor):
552     _VALID_URL = r'http://www\.myspass\.de/.*'
553
554     def _real_extract(self, url):
555         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
556
557         # video id is the last path element of the URL
558         # usually there is a trailing slash, so also try the second but last
559         url_path = compat_urllib_parse_urlparse(url).path
560         url_parent_path, video_id = os.path.split(url_path)
561         if not video_id:
562             _, video_id = os.path.split(url_parent_path)
563
564         # get metadata
565         metadata_url = META_DATA_URL_TEMPLATE % video_id
566         metadata_text = self._download_webpage(metadata_url, video_id)
567         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
568
569         # extract values from metadata
570         url_flv_el = metadata.find('url_flv')
571         if url_flv_el is None:
572             raise ExtractorError(u'Unable to extract download url')
573         video_url = url_flv_el.text
574         extension = os.path.splitext(video_url)[1][1:]
575         title_el = metadata.find('title')
576         if title_el is None:
577             raise ExtractorError(u'Unable to extract title')
578         title = title_el.text
579         format_id_el = metadata.find('format_id')
580         if format_id_el is None:
581             format = extension
582         else:
583             format = format_id_el.text
584         description_el = metadata.find('description')
585         if description_el is not None:
586             description = description_el.text
587         else:
588             description = None
589         imagePreview_el = metadata.find('imagePreview')
590         if imagePreview_el is not None:
591             thumbnail = imagePreview_el.text
592         else:
593             thumbnail = None
594         info = {
595             'id': video_id,
596             'url': video_url,
597             'title': title,
598             'ext': extension,
599             'format': format,
600             'thumbnail': thumbnail,
601             'description': description
602         }
603         return [info]
604
605 class SpiegelIE(InfoExtractor):
606     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
607
608     def _real_extract(self, url):
609         m = re.match(self._VALID_URL, url)
610         video_id = m.group('videoID')
611
612         webpage = self._download_webpage(url, video_id)
613
614         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
615             webpage, u'title')
616
617         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
618         xml_code = self._download_webpage(xml_url, video_id,
619                     note=u'Downloading XML', errnote=u'Failed to download XML')
620
621         idoc = xml.etree.ElementTree.fromstring(xml_code)
622         last_type = idoc[-1]
623         filename = last_type.findall('./filename')[0].text
624         duration = float(last_type.findall('./duration')[0].text)
625
626         video_url = 'http://video2.spiegel.de/flash/' + filename
627         video_ext = filename.rpartition('.')[2]
628         info = {
629             'id': video_id,
630             'url': video_url,
631             'ext': video_ext,
632             'title': video_title,
633             'duration': duration,
634         }
635         return [info]
636
637 class LiveLeakIE(InfoExtractor):
638
639     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
640     IE_NAME = u'liveleak'
641
642     def _real_extract(self, url):
643         mobj = re.match(self._VALID_URL, url)
644         if mobj is None:
645             raise ExtractorError(u'Invalid URL: %s' % url)
646
647         video_id = mobj.group('video_id')
648
649         webpage = self._download_webpage(url, video_id)
650
651         video_url = self._search_regex(r'file: "(.*?)",',
652             webpage, u'video URL')
653
654         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
655             webpage, u'title').replace('LiveLeak.com -', '').strip()
656
657         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
658             webpage, u'description', fatal=False)
659
660         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
661             webpage, u'uploader', fatal=False)
662
663         info = {
664             'id':  video_id,
665             'url': video_url,
666             'ext': 'mp4',
667             'title': video_title,
668             'description': video_description,
669             'uploader': video_uploader
670         }
671
672         return [info]
673
674
675
676 class TumblrIE(InfoExtractor):
677     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
678
679     def _real_extract(self, url):
680         m_url = re.match(self._VALID_URL, url)
681         video_id = m_url.group('id')
682         blog = m_url.group('blog_name')
683
684         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
685         webpage = self._download_webpage(url, video_id)
686
687         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
688         video = re.search(re_video, webpage)
689         if video is None:
690             raise ExtractorError(u'Unable to extract video')
691         video_url = video.group('video_url')
692         ext = video.group('ext')
693
694         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
695             webpage, u'thumbnail', fatal=False)  # We pick the first poster
696         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
697
698         # The only place where you can get a title, it's not complete,
699         # but searching in other places doesn't work for all videos
700         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
701             webpage, u'title', flags=re.DOTALL)
702
703         return [{'id': video_id,
704                  'url': video_url,
705                  'title': video_title,
706                  'thumbnail': video_thumbnail,
707                  'ext': ext
708                  }]
709
710 class BandcampIE(InfoExtractor):
711     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
712
713     def _real_extract(self, url):
714         mobj = re.match(self._VALID_URL, url)
715         title = mobj.group('title')
716         webpage = self._download_webpage(url, title)
717         # We get the link to the free download page
718         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
719         if m_download is None:
720             raise ExtractorError(u'No free songs found')
721
722         download_link = m_download.group(1)
723         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
724                        webpage, re.MULTILINE|re.DOTALL).group('id')
725
726         download_webpage = self._download_webpage(download_link, id,
727                                                   'Downloading free downloads page')
728         # We get the track dictionary from some javascript code
729         info = re.search(r'items: (.*?),$',
730                          download_webpage, re.MULTILINE).group(1)
731         info = json.loads(info)[0]
732         # We pick mp3-320 for now, until format selection can be easily implemented.
733         mp3_info = info[u'downloads'][u'mp3-320']
734         # If we try to use this url it says the link has expired
735         initial_url = mp3_info[u'url']
736         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
737         m_url = re.match(re_url, initial_url)
738         # We build the url we will use to get the final track url
739         # This url is built by Bandcamp in the script download_bunde_*.js
740         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
741         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
742         # If we could correctly generate the .rand field, the url would be
743         # in the "download_url" key
744         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
745
746         track_info = {'id': id,
747                       'title': info[u'title'],
748                       'ext': 'mp3',
749                       'url': final_url,
750                       'thumbnail': info[u'thumb_url'],
751                       'uploader': info[u'artist'],
752                       }
753
754         return [track_info]
755
756 class RedTubeIE(InfoExtractor):
757     """Information Extractor for redtube"""
758     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
759
760     def _real_extract(self,url):
761         mobj = re.match(self._VALID_URL, url)
762         if mobj is None:
763             raise ExtractorError(u'Invalid URL: %s' % url)
764
765         video_id = mobj.group('id')
766         video_extension = 'mp4'
767         webpage = self._download_webpage(url, video_id)
768
769         self.report_extraction(video_id)
770
771         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
772             webpage, u'video URL')
773
774         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
775             webpage, u'title')
776
777         return [{
778             'id':       video_id,
779             'url':      video_url,
780             'ext':      video_extension,
781             'title':    video_title,
782         }]
783         
784 class InaIE(InfoExtractor):
785     """Information Extractor for Ina.fr"""
786     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
787
788     def _real_extract(self,url):
789         mobj = re.match(self._VALID_URL, url)
790
791         video_id = mobj.group('id')
792         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
793         video_extension = 'mp4'
794         webpage = self._download_webpage(mrss_url, video_id)
795
796         self.report_extraction(video_id)
797
798         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
799             webpage, u'video URL')
800
801         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
802             webpage, u'title')
803
804         return [{
805             'id':       video_id,
806             'url':      video_url,
807             'ext':      video_extension,
808             'title':    video_title,
809         }]
810
811 class HowcastIE(InfoExtractor):
812     """Information Extractor for Howcast.com"""
813     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
814
815     def _real_extract(self, url):
816         mobj = re.match(self._VALID_URL, url)
817
818         video_id = mobj.group('id')
819         webpage_url = 'http://www.howcast.com/videos/' + video_id
820         webpage = self._download_webpage(webpage_url, video_id)
821
822         self.report_extraction(video_id)
823
824         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
825             webpage, u'video URL')
826
827         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
828             webpage, u'title')
829
830         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
831             webpage, u'description', fatal=False)
832
833         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
834             webpage, u'thumbnail', fatal=False)
835
836         return [{
837             'id':       video_id,
838             'url':      video_url,
839             'ext':      'mp4',
840             'title':    video_title,
841             'description': video_description,
842             'thumbnail': thumbnail,
843         }]
844
845 class VineIE(InfoExtractor):
846     """Information Extractor for Vine.co"""
847     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
848
849     def _real_extract(self, url):
850         mobj = re.match(self._VALID_URL, url)
851
852         video_id = mobj.group('id')
853         webpage_url = 'https://vine.co/v/' + video_id
854         webpage = self._download_webpage(webpage_url, video_id)
855
856         self.report_extraction(video_id)
857
858         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
859             webpage, u'video URL')
860
861         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
862             webpage, u'title')
863
864         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
865             webpage, u'thumbnail', fatal=False)
866
867         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
868             webpage, u'uploader', fatal=False, flags=re.DOTALL)
869
870         return [{
871             'id':        video_id,
872             'url':       video_url,
873             'ext':       'mp4',
874             'title':     video_title,
875             'thumbnail': thumbnail,
876             'uploader':  uploader,
877         }]
878
879 class FlickrIE(InfoExtractor):
880     """Information Extractor for Flickr videos"""
881     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
882
883     def _real_extract(self, url):
884         mobj = re.match(self._VALID_URL, url)
885
886         video_id = mobj.group('id')
887         video_uploader_id = mobj.group('uploader_id')
888         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
889         webpage = self._download_webpage(webpage_url, video_id)
890
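        # Playback needs the photo secret from the page, then two XML lookups: the first yields a node_id, the second the STREAM APP/FULLPATH pair that forms the video URL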
891         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
892
893         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
894         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
895
896         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
897             first_xml, u'node_id')
898
899         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
900         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
901
902         self.report_extraction(video_id)
903
904         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
905         if mobj is None:
906             raise ExtractorError(u'Unable to extract video url')
907         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
908
909         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
910             webpage, u'video title')
911
912         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
913             webpage, u'description', fatal=False)
914
915         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
916             webpage, u'thumbnail', fatal=False)
917
918         return [{
919             'id':          video_id,
920             'url':         video_url,
921             'ext':         'mp4',
922             'title':       video_title,
923             'description': video_description,
924             'thumbnail':   thumbnail,
925             'uploader_id': video_uploader_id,
926         }]
927
928 class TeamcocoIE(InfoExtractor):
929     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
930
931     def _real_extract(self, url):
932         mobj = re.match(self._VALID_URL, url)
933         if mobj is None:
934             raise ExtractorError(u'Invalid URL: %s' % url)
935         url_title = mobj.group('url_title')
936         webpage = self._download_webpage(url, url_title)
937
938         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
939             webpage, u'video id')
940
941         self.report_extraction(video_id)
942
943         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
944             webpage, u'title')
945
946         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
947             webpage, u'thumbnail', fatal=False)
948
949         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
950             webpage, u'description', fatal=False)
951
952         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
953         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
954
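        # Pick the <file type="high"> entry from the CVP data document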
955         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
956             data, u'video URL')
957
958         return [{
959             'id':          video_id,
960             'url':         video_url,
961             'ext':         'mp4',
962             'title':       video_title,
963             'thumbnail':   thumbnail,
964             'description': video_description,
965         }]
966
967 class XHamsterIE(InfoExtractor):
968     """Information Extractor for xHamster"""
969     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
970
971     def _real_extract(self,url):
972         mobj = re.match(self._VALID_URL, url)
973
974         video_id = mobj.group('id')
975         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
976         webpage = self._download_webpage(mrss_url, video_id)
977
978         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
979         if mobj is None:
980             raise ExtractorError(u'Unable to extract media URL')
981         if len(mobj.group('server')) == 0:
982             video_url = compat_urllib_parse.unquote(mobj.group('file'))
983         else:
984             video_url = mobj.group('server')+'/key='+mobj.group('file')
985         video_extension = video_url.split('.')[-1]
986
987         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
988             webpage, u'title')
989
990         # Can't see the description anywhere in the UI
991         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
992         #     webpage, u'description', fatal=False)
993         # if video_description: video_description = unescapeHTML(video_description)
994
995         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
996         if mobj:
997             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
998         else:
999             video_upload_date = None
1000             self._downloader.report_warning(u'Unable to extract upload date')
1001
1002         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1003             webpage, u'uploader id', default=u'anonymous')
1004
1005         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1006             webpage, u'thumbnail', fatal=False)
1007
1008         return [{
1009             'id':       video_id,
1010             'url':      video_url,
1011             'ext':      video_extension,
1012             'title':    video_title,
1013             # 'description': video_description,
1014             'upload_date': video_upload_date,
1015             'uploader_id': video_uploader_id,
1016             'thumbnail': video_thumbnail
1017         }]
1018
1019 class HypemIE(InfoExtractor):
1020     """Information Extractor for hypem"""
1021     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1022
1023     def _real_extract(self, url):
1024         mobj = re.match(self._VALID_URL, url)
1025         if mobj is None:
1026             raise ExtractorError(u'Invalid URL: %s' % url)
1027         track_id = mobj.group(1)
1028
1029         data = { 'ax': 1, 'ts': time.time() }
1030         data_encoded = compat_urllib_parse.urlencode(data)
1031         complete_url = url + "?" + data_encoded
1032         request = compat_urllib_request.Request(complete_url)
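        # The first request returns the page plus a session cookie; the cookie has to be echoed back when requesting the serve URL below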
1033         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1034         cookie = urlh.headers.get('Set-Cookie', '')
1035
1036         self.report_extraction(track_id)
1037
1038         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1039             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1040         try:
1041             track_list = json.loads(html_tracks)
1042             track = track_list[u'tracks'][0]
1043         except ValueError:
1044             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1045
1046         key = track[u"key"]
1047         track_id = track[u"id"]
1048         artist = track[u"artist"]
1049         title = track[u"song"]
1050
1051         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1052         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1053         request.add_header('cookie', cookie)
1054         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1055         try:
1056             song_data = json.loads(song_data_json)
1057         except ValueError:
1058             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1059         final_url = song_data[u"url"]
1060
1061         return [{
1062             'id':       track_id,
1063             'url':      final_url,
1064             'ext':      "mp3",
1065             'title':    title,
1066             'artist':   artist,
1067         }]
1068
1069 class Vbox7IE(InfoExtractor):
1070     """Information Extractor for Vbox7"""
1071     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1072
1073     def _real_extract(self,url):
1074         mobj = re.match(self._VALID_URL, url)
1075         if mobj is None:
1076             raise ExtractorError(u'Invalid URL: %s' % url)
1077         video_id = mobj.group(1)
1078
1079         redirect_page, urlh = self._download_webpage_handle(url, video_id)
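        # The play page redirects via JavaScript; extract the window.location target and follow it manually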
1080         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1081         redirect_url = urlh.geturl() + new_location
1082         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1083
1084         title = self._html_search_regex(r'<title>(.*)</title>',
1085             webpage, u'title').split('/')[0].strip()
1086
1087         ext = "flv"
1088         info_url = "http://vbox7.com/play/magare.do"
1089         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1090         info_request = compat_urllib_request.Request(info_url, data)
1091         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1092         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1093         if info_response is None:
1094             raise ExtractorError(u'Unable to extract the media url')
1095         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1096
1097         return [{
1098             'id':        video_id,
1099             'url':       final_url,
1100             'ext':       ext,
1101             'title':     title,
1102             'thumbnail': thumbnail_url,
1103         }]
1104
1105
1106 def gen_extractors():
1107     """ Return a list with an instance of every supported extractor.
1108     The order does matter: the first extractor that matches the URL is the one that handles it.
1109     """
1110     return [
1111         YoutubePlaylistIE(),
1112         YoutubeChannelIE(),
1113         YoutubeUserIE(),
1114         YoutubeSearchIE(),
1115         YoutubeIE(),
1116         MetacafeIE(),
1117         DailymotionIE(),
1118         GoogleSearchIE(),
1119         PhotobucketIE(),
1120         YahooIE(),
1121         YahooSearchIE(),
1122         DepositFilesIE(),
1123         FacebookIE(),
1124         BlipTVIE(),
1125         BlipTVUserIE(),
1126         VimeoIE(),
1127         MyVideoIE(),
1128         ComedyCentralIE(),
1129         EscapistIE(),
1130         CollegeHumorIE(),
1131         XVideosIE(),
1132         SoundcloudSetIE(),
1133         SoundcloudIE(),
1134         InfoQIE(),
1135         MixcloudIE(),
1136         StanfordOpenClassroomIE(),
1137         MTVIE(),
1138         YoukuIE(),
1139         XNXXIE(),
1140         YouJizzIE(),
1141         PornotubeIE(),
1142         YouPornIE(),
1143         GooglePlusIE(),
1144         ArteTvIE(),
1145         NBAIE(),
1146         WorldStarHipHopIE(),
1147         JustinTVIE(),
1148         FunnyOrDieIE(),
1149         SteamIE(),
1150         UstreamIE(),
1151         RBMARadioIE(),
1152         EightTracksIE(),
1153         KeekIE(),
1154         TEDIE(),
1155         MySpassIE(),
1156         SpiegelIE(),
1157         LiveLeakIE(),
1158         ARDIE(),
1159         ZDFIE(),
1160         TumblrIE(),
1161         BandcampIE(),
1162         RedTubeIE(),
1163         InaIE(),
1164         HowcastIE(),
1165         VineIE(),
1166         FlickrIE(),
1167         TeamcocoIE(),
1168         XHamsterIE(),
1169         HypemIE(),
1170         Vbox7IE(),
1171         GametrailersIE(),
1172         StatigramIE(),
1173         GenericIE()
1174     ]
1175
1176 def get_info_extractor(ie_name):
1177     """Returns the info extractor class with the given ie_name"""
1178     return globals()[ie_name+'IE']
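
# A minimal usage sketch (illustrative only, not part of this module): the
# downloader picks the first extractor whose suitable() accepts the URL,
# roughly like this:
#
#   for ie in gen_extractors():
#       if ie.suitable(url):
#           ie.set_downloader(downloader)
#           return ie.extract(url)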