[keek] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.metacafe import MetacafeIE
40 from .extractor.mixcloud import MixcloudIE
41 from .extractor.mtv import MTVIE
42 from .extractor.myvideo import MyVideoIE
43 from .extractor.nba import NBAIE
44 from .extractor.statigram import StatigramIE
45 from .extractor.photobucket import PhotobucketIE
46 from .extractor.pornotube import PornotubeIE
47 from .extractor.rbmaradio import RBMARadioIE
48 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
49 from .extractor.stanfordoc import StanfordOpenClassroomIE
50 from .extractor.steam import SteamIE
51 from .extractor.ted import TEDIE
52 from .extractor.ustream import UstreamIE
53 from .extractor.vimeo import VimeoIE
54 from .extractor.worldstarhiphop import WorldStarHipHopIE
55 from .extractor.xnxx import XNXXIE
56 from .extractor.xvideos import XVideosIE
57 from .extractor.yahoo import YahooIE, YahooSearchIE
58 from .extractor.youjizz import YouJizzIE
59 from .extractor.youku import YoukuIE
60 from .extractor.youporn import YouPornIE
61 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
62 from .extractor.zdf import ZDFIE
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is read from the URL path and used to download an XML
    metadata document that carries the media URL, title, format,
    description and preview image.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError when the metadata has no ``url_flv`` or no
        ``title`` element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        # extension without the leading dot
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # No explicit format id: fall back to the file extension.
            # (This branch used to reference an undefined name.)
            video_format = extension
        else:
            video_format = format_id_el.text

        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
144
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        The title is read from the video page; file name and duration are
        read from a per-video XML document.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        # Feed the parser bytes rather than text: on Python 2 a unicode
        # string containing non-ASCII characters cannot be parsed directly.
        idoc = xml.etree.ElementTree.fromstring(xml_code.encode('utf-8'))
        # Use the last child of the root element
        # NOTE(review): presumably the preferred variant - confirm ordering
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        # everything after the last dot of the file name
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
176
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http:// and https:// are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        Description and uploader are optional (looked up with fatal=False).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site prefix from the og:title value
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
213
214
215
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError when no video source is found in the post page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical post page, even for /video/ URLs
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes from the URL, so escape it before embedding
        # it in a pattern (video_id is digits only and needs no escaping).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster.  Only the poster URL is captured; the
        # text between 'posters' and '[' must not be a capturing group, or
        # it could be handed back instead of the URL.
        # NOTE(review): confirm which group _search_regex returns
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
249
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError when the track has no free download page or
        when any of the intermediate pages cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
295
class RedTubeIE(InfoExtractor):
    """Extractor for redtube video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # media URL first, then the title, both taken from the video page
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        page_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = page_title
        return [result]
323         
class InaIE(InfoExtractor):
    """Extractor for Ina.fr video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        The media URL and title are both read from the MRSS notice that is
        downloaded for the video id.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        notice_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        notice = self._download_webpage(notice_url, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        notice_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = notice_title
        return [result]
350
class HowcastIE(InfoExtractor):
    """Extractor for Howcast.com video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        Description and thumbnail are optional (looked up with fatal=False).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested through its normalised address
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)

        image = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = title
        result['description'] = description
        result['thumbnail'] = image
        return [result]
384
class VineIE(InfoExtractor):
    """Extractor for Vine.co video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        Thumbnail and uploader are optional (looked up with fatal=False).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested through its https address
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)

        user_name = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = title
        result['thumbnail'] = image
        result['uploader'] = user_name
        return [result]
418
class FlickrIE(InfoExtractor):
    """Extractor for videos hosted on Flickr photo pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        Three downloads are chained: the photo page (for the secret and the
        metadata), a first XML document (for the node id) and a second XML
        document (for the stream location).  Raises ExtractorError when the
        stream location is missing from the second document.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        owner_id = url_match.group('uploader_id')

        page_url = ''.join(['http://www.flickr.com/photos/', owner_id, '/', video_id])
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # step 1: look up the node id
        lookup_url = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ])
        lookup_xml = self._download_webpage(lookup_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            lookup_xml, u'node_id')

        # step 2: fetch the playlist that names the stream
        playlist_url = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ])
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if not stream:
            raise ExtractorError(u'Unable to extract video url')
        # only the path part is HTML-unescaped before being appended
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        image = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = title
        result['description'] = description
        result['thumbnail'] = image
        result['uploader_id'] = owner_id
        return [result]
467
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Build the single-entry result list for ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        Thumbnail and description are optional (looked up with fatal=False).
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the download
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location comes from a separate per-video XML document
        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data_xml, u'video URL')

        result = {}
        result['id'] = video_id
        result['url'] = media_url
        result['ext'] = 'mp4'
        result['title'] = title
        result['thumbnail'] = image
        result['description'] = description
        return [result]
506
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix
    # is accepted.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError when the media server/file pair cannot be
        found in the page.  A missing upload date only produces a warning;
        a missing uploader id defaults to ``anonymous``.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The page is requested through a normalised address built from the id
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is itself a percent-encoded URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        # text after the last dot of the media URL
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # concatenated as YYYYMMDD
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
558
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if either JSON payload cannot be decoded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is replayed on the metadata request
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the track data replaces the URL one
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body still makes this a POST.  It has to be bytes:
        # Python 3 urllib refuses a text string as request data.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
608
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if the info request yields no response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only carries a javascript redirect; follow it by hand
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Keep only the part of the page title before the first '/'
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        # POST data has to be bytes on Python 3; urlencode output is
        # percent-encoded and therefore plain ASCII.
        info_request = compat_urllib_request.Request(info_url, data.encode('ascii'))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is unpacked as two '&'-separated name=value fields.
        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole.
        (final_url, thumbnail_url) = [field.split('=', 1)[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
644
645
def gen_extractors():
    """Build one fresh instance of each supported extractor.

    The sequence is significant: a URL is handled by the first extractor
    that matches it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order
    return [extractor_class() for extractor_class in extractor_classes]
715
def get_info_extractor(ie_name):
    """Look up an extractor class in this module by its short name.

    ``ie_name`` is the class name without the trailing ``IE``; a KeyError
    is raised when this module defines no such name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]