[justin.tv] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
45 from .extractor.stanfordoc import StanfordOpenClassroomIE
46 from .extractor.steam import SteamIE
47 from .extractor.ted import TEDIE
48 from .extractor.vimeo import VimeoIE
49 from .extractor.worldstarhiphop import WorldStarHipHopIE
50 from .extractor.xnxx import XNXXIE
51 from .extractor.xvideos import XVideosIE
52 from .extractor.yahoo import YahooIE, YahooSearchIE
53 from .extractor.youku import YoukuIE
54 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
55 from .extractor.zdf import ZDFIE
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dict for the recording addressed by ``url``.

        The media URL is derived from the numeric id found in ``url``;
        title, uploader and thumbnail are scraped from the downloaded page
        (uploader and thumbnail are looked up with ``fatal=False``).

        Returns a single info dict (not wrapped in a list).
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', page, u'title')
        channel_name = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)
        thumb = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            # The stream location only depends on the id, not on the page.
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': channel_name,
            'thumbnail': thumb,
        }
103
104
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the show at ``url``.

        The show metadata is read from the JSON object assigned to
        ``gon.show`` in the page source.

        Raises ExtractorError if that JSON cannot be parsed.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        raw_show = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)
        try:
            show = json.loads(raw_show)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        stream_url = show['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path.
        stream_path = compat_urllib_parse_urlparse(stream_url).path
        extension = stream_path.rpartition('.')[2]

        host = show.get('host', {})
        image = show.get('image', {})
        return [{
            'id': video_id,
            'url': stream_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': show.get('duration'),
        }]
138
139
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of ``formats`` whose 'format' equals
        ``req_format``, or None when there is no such entry."""
        for entry in formats:
            if entry['format'] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of the video at ``url``.

        Returns a list of info dicts chosen according to the downloader's
        'format' parameter ('best'/unset -> first link, 'worst' -> last
        link, '-1'/'all' -> every link, anything else -> the entry with
        that exact format name).  When the 'listformats' parameter is set
        the formats are printed and None is returned.

        Raises ExtractorError on an invalid URL, unparsable or incomplete
        page metadata, a page without download links, or an unavailable
        requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Only parse failures are translated; a bare except would also
            # swallow KeyboardInterrupt and friends.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # Format the exception instead of concatenating it to a string,
            # which would itself fail with a TypeError.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The format name is "<size>-<bitrate>", taken from the fifth
            # path component (e.g. "480p_370k_8004515" -> "480p-370k").
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        if req_format == 'worst':
            return [formats[-1]]
        if req_format in ('-1', 'all'):
            return formats

        selected = self._specific(req_format, formats)
        if selected is None:
            raise ExtractorError(u'Requested format not available')
        return [selected]
244
245
246
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Id and title are taken from the URL itself; the media URL and the
        upload date (looked up with ``fatal=False``) come from the page.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id, video_title = match.group('videoid', 'title')

        page = self._download_webpage(url, video_id)

        # Location of the .flv file, stored URL-quoted in the page.
        quoted_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # Upload date; only normalised when something was found.
        raw_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        upload_date = unified_strdate(raw_date) if raw_date else raw_date

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
281
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The title is read from the video page; the media URL is read from
        the embed page linked from it.  The reported id is the numeric id
        found in the embed page URL.

        Raises ExtractorError on an invalid URL or when no embed page link
        is present.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        page = self._download_webpage(url, url_match.group('videoid'))

        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', page, u'title').strip()

        # Locate the embed page; its URL carries the numeric video id.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)

        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
322
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dict per track of the mix at ``url``.

        The mix description is read from the ``PAGE.mix`` JSON in the page;
        tracks are then fetched one at a time from the play/next endpoints
        until the response flags the last track.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random session number is sent with every request of this run.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']

        tracks = []
        request_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        for position in itertools.count(1):
            response = self._download_webpage(
                request_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), total),
                errnote=u'Failed to download song information')
            play_set = json.loads(response)
            track = play_set[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if play_set['set']['at_last_track']:
                break
            # The next request names the track that was just received.
            request_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
363
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Media and thumbnail URLs are built from the id in ``url``; the
        title and the uploader (``fatal=False``) are scraped from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        user_name = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            # Both CDN locations depend only on the id.
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': user_name,
        }]
391
392
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The video id is the last non-empty path element of ``url``; all
        other fields come from the metadata XML served for that id.  When
        the XML has no ``format_id`` element the file extension is used as
        the format name.

        Raises ExtractorError if the XML lacks the ``url_flv`` or ``title``
        element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # mandatory values
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional values
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension (the original referenced an
            # undefined name here and crashed with a NameError).
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        image_preview_el = metadata.find('imagePreview')
        thumbnail = image_preview_el.text if image_preview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
446
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The title is scraped from the video page; file name and duration
        are read from the last child of the per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_text = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last entry of the document is used.
        variant = xml.etree.ElementTree.fromstring(xml_text)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            # Extension is whatever follows the last dot of the file name.
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
478
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional and may be http or https.  (The pattern used to
    # read "http?://", which matches "htt://" or "http://" but never
    # "https://".)
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Media URL and title are required; description and uploader are
        looked up with ``fatal=False``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
515
516
517
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The post page is always requested through its ``/post/<id>/``
        address, whatever form ``url`` had.

        Raises ExtractorError if no video source is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # The player markup is embedded with its quotes written as \x22.
        source_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source = re.search(source_re, page)
        if source is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        poster = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if poster:
            poster = poster.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': source.group('video_url'),
            'title': title,
            'thumbnail': poster,
            'ext': source.group('ext'),
        }]
551
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Follows the track's free download page, reads the track dictionary
        from it and asks the ``statdownload`` endpoint for the final file
        URL.  Only the ``mp3-320`` download is used.

        Raises ExtractorError when the URL is invalid, the track has no
        free download page, or any of the expected page fragments cannot
        be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')

        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
597
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Both the media URL and the title are scraped from the video page.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
625         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The video page itself is not fetched: media URL and title are read
        from the MRSS notice served for the id found in ``url``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        notice = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        # The title sits inside a CDATA section of the <title> element.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
652
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is requested through its ``http://www.howcast.com/videos/<id>``
        address.  Media URL and title are required; description and
        thumbnail are looked up with ``fatal=False``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The meta tags may quote their content with either quote style.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumb = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
        }]
686
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is requested through its ``https://vine.co/v/<id>``
        address.  Media URL and title are required; thumbnail and uploader
        are looked up with ``fatal=False``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        user = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'uploader': user,
        }]
720
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Three documents are fetched: the photo page (for the secret and
        the og: metadata), a first XML document (for the node id) and a
        second XML playlist (for the stream location).

        Raises ExtractorError if the playlist has no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/%s/%s' % (uploader_id, video_id),
            video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: id + secret -> node id
        mtl_url = ('https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s'
                   '&secret=%s&bitrate=700&target=_self') % (video_id, secret)
        mtl_xml = self._download_webpage(mtl_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', mtl_xml, u'node_id')

        # Step 2: node id + secret -> playlist holding the stream
        playlist_url = ('https://secure.flickr.com/video_playlist.gne?node_id=%s'
                        '&tech=flash&mode=playlist&bitrate=700&secret=%s'
                        '&rd=video.yahoo.com&noad=1') % (node_id, secret)
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        app, full_path = stream.group(1, 2)
        media_url = app + unescapeHTML(full_path)

        # The meta tags may quote their content with either quote style.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumb = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
            'uploader_id': uploader_id,
        }]
769
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The numeric video id and the og: metadata come from the video
        page; the media URL comes from the per-id XML document, using its
        ``high`` file entry.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')

        # Until the numeric id is known the URL slug identifies the download.
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', cvp_xml, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }]
808
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract a single xHamster movie.

        Returns a one-element list containing the info dictionary with the
        keys id, url, ext, title, upload_date, uploader_id and thumbnail.
        upload_date is None when no date is found on the page; uploader_id
        falls back to u'anonymous'.

        Raises ExtractorError when url does not match _VALID_URL or when the
        media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Same failure mode as the other extractors in this file,
            # instead of an AttributeError on mobj.group().
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id only, with an empty title slug.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        # The date is read from a hint='YYYY-MM-DD HH:MM:SS TZ' attribute and
        # stored as YYYYMMDD.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
860
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract a single hypem.com track.

        The track page embeds a JSON "displayList" whose first track supplies
        the id, key, artist and song title. A second request to the
        serve/source endpoint, sent with the cookie received from the first
        response, returns JSON holding the final media URL.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when url does not match _VALID_URL, when either
        response is not valid JSON, or when the page lists no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        # Extend an existing query string instead of adding a second "?".
        separator = "&" if "?" in url else "?"
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie from this response is replayed on the metadata request.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            # Valid JSON, but without a usable first track.
            raise ExtractorError(u'Hypemachine track list is empty or missing.')

        key = track[u"key"]
        # From here on the id reported by the site replaces the URL component.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
910
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract a single Vbox7 video.

        The first page only contains a JavaScript redirect; the title is read
        from the page it points to. The media and thumbnail locations come
        from a form-encoded POST to the magare.do endpoint.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when url does not match _VALID_URL or when the
        info response cannot be split into a media URL and a thumbnail URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the URL that was actually served.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first "/" is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is read as two "&"-separated name=value fields: the
        # media URL first, the thumbnail second. Splitting on the first "="
        # only keeps values that themselves contain "=" intact.
        try:
            final_url, thumbnail_url = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without "=" or an unexpected number of fields.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
946
947
def gen_extractors():
    """Return a list holding one new instance of every supported extractor.

    The position in the list is significant: the first extractor that
    matches a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the matching priority is kept.
    return [extractor_class() for extractor_class in extractor_classes]
1017
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is fetched from this module's global namespace, so a KeyError
    is raised when no global called ie_name + 'IE' exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]