Move Youku IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.ted import TEDIE
45 from .extractor.vimeo import VimeoIE
46 from .extractor.xvideos import XVideosIE
47 from .extractor.yahoo import YahooIE, YahooSearchIE
48 from .extractor.youku import YoukuIE
49 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
50 from .extractor.zdf import ZDFIE
51
52
53
54
55
56
57
58
59
60 class XNXXIE(InfoExtractor):
61     """Information extractor for xnxx.com"""
62
63     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
64     IE_NAME = u'xnxx'
65     VIDEO_URL_RE = r'flv_url=(.*?)&'
66     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
67     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
68
69     def _real_extract(self, url):
70         mobj = re.match(self._VALID_URL, url)
71         if mobj is None:
72             raise ExtractorError(u'Invalid URL: %s' % url)
73         video_id = mobj.group(1)
74
75         # Get webpage content
76         webpage = self._download_webpage(url, video_id)
77
78         video_url = self._search_regex(self.VIDEO_URL_RE,
79             webpage, u'video URL')
80         video_url = compat_urllib_parse.unquote(video_url)
81
82         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
83             webpage, u'title')
84
85         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
86             webpage, u'thumbnail', fatal=False)
87
88         return [{
89             'id': video_id,
90             'url': video_url,
91             'uploader': None,
92             'upload_date': None,
93             'title': video_title,
94             'ext': 'flv',
95             'thumbnail': video_thumbnail,
96             'description': None,
97         }]
98
99
100
101
102 class JustinTVIE(InfoExtractor):
103     """Information extractor for justin.tv and twitch.tv"""
104     # TODO: One broadcast may be split into multiple videos. The key
105     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
106     # starts at 1 and increases. Can we treat all parts as one video?
107
108     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
109         (?:
110             (?P<channelid>[^/]+)|
111             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
112             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
113         )
114         /?(?:\#.*)?$
115         """
116     _JUSTIN_PAGE_LIMIT = 100
117     IE_NAME = u'justin.tv'
118
119     def report_download_page(self, channel, offset):
120         """Report attempt to download a single page of videos."""
121         self.to_screen(u'%s: Downloading video information from %d to %d' %
122                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
123
124     # Return count of items, list of *valid* items
125     def _parse_page(self, url, video_id):
126         webpage = self._download_webpage(url, video_id,
127                                          u'Downloading video info JSON',
128                                          u'unable to download video info JSON')
129
130         response = json.loads(webpage)
131         if not isinstance(response, list):
132             error_text = response.get('error', 'unknown error')
133             raise ExtractorError(u'Justin.tv API: %s' % error_text)
134         info = []
135         for clip in response:
136             video_url = clip['video_file_url']
137             if video_url:
138                 video_extension = os.path.splitext(video_url)[1][1:]
139                 video_date = re.sub('-', '', clip['start_time'][:10])
140                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
141                 video_id = clip['id']
142                 video_title = clip.get('title', video_id)
143                 info.append({
144                     'id': video_id,
145                     'url': video_url,
146                     'title': video_title,
147                     'uploader': clip.get('channel_name', video_uploader_id),
148                     'uploader_id': video_uploader_id,
149                     'upload_date': video_date,
150                     'ext': video_extension,
151                 })
152         return (len(response), info)
153
154     def _real_extract(self, url):
155         mobj = re.match(self._VALID_URL, url)
156         if mobj is None:
157             raise ExtractorError(u'invalid URL: %s' % url)
158
159         api_base = 'http://api.justin.tv'
160         paged = False
161         if mobj.group('channelid'):
162             paged = True
163             video_id = mobj.group('channelid')
164             api = api_base + '/channel/archives/%s.json' % video_id
165         elif mobj.group('chapterid'):
166             chapter_id = mobj.group('chapterid')
167
168             webpage = self._download_webpage(url, chapter_id)
169             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
170             if not m:
171                 raise ExtractorError(u'Cannot find archive of a chapter')
172             archive_id = m.group(1)
173
174             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
175             chapter_info_xml = self._download_webpage(api, chapter_id,
176                                              note=u'Downloading chapter information',
177                                              errnote=u'Chapter information download failed')
178             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
179             for a in doc.findall('.//archive'):
180                 if archive_id == a.find('./id').text:
181                     break
182             else:
183                 raise ExtractorError(u'Could not find chapter in chapter information')
184
185             video_url = a.find('./video_file_url').text
186             video_ext = video_url.rpartition('.')[2] or u'flv'
187
188             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
189             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
190                                    note='Downloading chapter metadata',
191                                    errnote='Download of chapter metadata failed')
192             chapter_info = json.loads(chapter_info_json)
193
194             bracket_start = int(doc.find('.//bracket_start').text)
195             bracket_end = int(doc.find('.//bracket_end').text)
196
197             # TODO determine start (and probably fix up file)
198             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
199             #video_url += u'?start=' + TODO:start_timestamp
200             # bracket_start is 13290, but we want 51670615
201             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
202                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
203
204             info = {
205                 'id': u'c' + chapter_id,
206                 'url': video_url,
207                 'ext': video_ext,
208                 'title': chapter_info['title'],
209                 'thumbnail': chapter_info['preview'],
210                 'description': chapter_info['description'],
211                 'uploader': chapter_info['channel']['display_name'],
212                 'uploader_id': chapter_info['channel']['name'],
213             }
214             return [info]
215         else:
216             video_id = mobj.group('videoid')
217             api = api_base + '/broadcast/by_archive/%s.json' % video_id
218
219         self.report_extraction(video_id)
220
221         info = []
222         offset = 0
223         limit = self._JUSTIN_PAGE_LIMIT
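        # Page through the archive API; a page with fewer than 'limit' entries marks the end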
224         while True:
225             if paged:
226                 self.report_download_page(video_id, offset)
227             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
228             page_count, page_info = self._parse_page(page_url, video_id)
229             info.extend(page_info)
230             if not paged or page_count != limit:
231                 break
232             offset += limit
233         return info
234
235 class FunnyOrDieIE(InfoExtractor):
236     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
237
238     def _real_extract(self, url):
239         mobj = re.match(self._VALID_URL, url)
240         if mobj is None:
241             raise ExtractorError(u'invalid URL: %s' % url)
242
243         video_id = mobj.group('id')
244         webpage = self._download_webpage(url, video_id)
245
246         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
247             webpage, u'video URL', flags=re.DOTALL)
248
249         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
250             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
251
252         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
253             webpage, u'description', fatal=False, flags=re.DOTALL)
254
255         info = {
256             'id': video_id,
257             'url': video_url,
258             'ext': 'mp4',
259             'title': title,
260             'description': video_description,
261         }
262         return [info]
263
264 class SteamIE(InfoExtractor):
265     _VALID_URL = r"""http://store\.steampowered\.com/
266                 (agecheck/)?
267                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
268                 (?P<gameID>\d+)/?
269                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
270                 """
271     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
272     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
273
274     @classmethod
275     def suitable(cls, url):
276         """Receives a URL and returns True if suitable for this IE."""
277         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
278
279     def _real_extract(self, url):
280         m = re.match(self._VALID_URL, url, re.VERBOSE)
281         gameID = m.group('gameID')
282
283         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
284         webpage = self._download_webpage(videourl, gameID)
285
286         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
287             videourl = self._AGECHECK_TEMPLATE % gameID
288             self.report_age_confirmation()
289             webpage = self._download_webpage(videourl, gameID)
290
291         self.report_extraction(gameID)
292         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
293                                              webpage, 'game title')
294
295         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
296         mweb = re.finditer(urlRE, webpage)
297         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
298         titles = re.finditer(namesRE, webpage)
299         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
300         thumbs = re.finditer(thumbsRE, webpage)
301         videos = []
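        # The movie, title and thumbnail matches appear in the same order on the page, so pair them up by position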
302         for vid, vtitle, thumb in zip(mweb, titles, thumbs):
303             video_id = vid.group('videoID')
304             title = vtitle.group('videoName')
305             video_url = vid.group('videoURL')
306             video_thumb = thumb.group('thumbnail')
307             if not video_url:
308                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
309             info = {
310                 'id': video_id,
311                 'url': video_url,
312                 'ext': 'flv',
313                 'title': unescapeHTML(title),
314                 'thumbnail': video_thumb,
315             }
316             videos.append(info)
317         return [self.playlist_result(videos, gameID, game_title)]
318
319 class UstreamIE(InfoExtractor):
320     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
321     IE_NAME = u'ustream'
322
323     def _real_extract(self, url):
324         m = re.match(self._VALID_URL, url)
325         video_id = m.group('videoID')
326
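        # Build the direct CDN URL for the recorded video from its id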
327         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
328         webpage = self._download_webpage(url, video_id)
329
330         self.report_extraction(video_id)
331
332         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
333             webpage, u'title')
334
335         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
336             webpage, u'uploader', fatal=False, flags=re.DOTALL)
337
338         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
339             webpage, u'thumbnail', fatal=False)
340
341         info = {
342                 'id': video_id,
343                 'url': video_url,
344                 'ext': 'flv',
345                 'title': video_title,
346                 'uploader': uploader,
347                 'thumbnail': thumbnail,
348                }
349         return info
350
351 class WorldStarHipHopIE(InfoExtractor):
352     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
353     IE_NAME = u'WorldStarHipHop'
354
355     def _real_extract(self, url):
356         m = re.match(self._VALID_URL, url)
357         video_id = m.group('id')
358
359         webpage_src = self._download_webpage(url, video_id)
360
361         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
362             webpage_src, u'video URL')
363
364         if 'mp4' in video_url:
365             ext = 'mp4'
366         else:
367             ext = 'flv'
368
369         video_title = self._html_search_regex(r"<title>(.*)</title>",
370             webpage_src, u'title')
371
372         # Get the thumbnail; if it is missing, this is a WSHH candy video, so pull the correct title from the candy markup instead.
373         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
374             webpage_src, u'thumbnail', fatal=False)
375
376         if not thumbnail:
377             _title = r"""candytitles.*>(.*)</span>"""
378             mobj = re.search(_title, webpage_src)
379             if mobj is not None:
380                 video_title = mobj.group(1)
381
382         results = [{
383                     'id': video_id,
384                     'url': video_url,
385                     'title': video_title,
386                     'thumbnail': thumbnail,
387                     'ext': ext,
388                     }]
389         return results
390
391 class RBMARadioIE(InfoExtractor):
392     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
393
394     def _real_extract(self, url):
395         m = re.match(self._VALID_URL, url)
396         video_id = m.group('videoID')
397
398         webpage = self._download_webpage(url, video_id)
399
400         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
401             webpage, u'json data', flags=re.MULTILINE)
402
403         try:
404             data = json.loads(json_data)
405         except ValueError as e:
406             raise ExtractorError(u'Invalid JSON: ' + str(e))
407
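        # Append the bitrate parameter to the Akamai URL to request the 256 kbps stream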
408         video_url = data['akamai_url'] + '&cbr=256'
409         url_parts = compat_urllib_parse_urlparse(video_url)
410         video_ext = url_parts.path.rpartition('.')[2]
411         info = {
412                 'id': video_id,
413                 'url': video_url,
414                 'ext': video_ext,
415                 'title': data['title'],
416                 'description': data.get('teaser_text'),
417                 'location': data.get('country_of_origin'),
418                 'uploader': data.get('host', {}).get('name'),
419                 'uploader_id': data.get('host', {}).get('slug'),
420                 'thumbnail': data.get('image', {}).get('large_url_2x'),
421                 'duration': data.get('duration'),
422         }
423         return [info]
424
425
426 class YouPornIE(InfoExtractor):
427     """Information extractor for youporn.com."""
428     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
429
430     def _print_formats(self, formats):
431         """Print all available formats"""
432         print(u'Available formats:')
433         print(u'ext\t\tformat')
434         print(u'---------------------------------')
435         for format in formats:
436             print(u'%s\t\t%s'  % (format['ext'], format['format']))
437
438     def _specific(self, req_format, formats):
439         for x in formats:
440             if x["format"] == req_format:
441                 return x
442         return None
443
444     def _real_extract(self, url):
445         mobj = re.match(self._VALID_URL, url)
446         if mobj is None:
447             raise ExtractorError(u'Invalid URL: %s' % url)
448         video_id = mobj.group('videoid')
449
450         req = compat_urllib_request.Request(url)
451         req.add_header('Cookie', 'age_verified=1')
452         webpage = self._download_webpage(req, video_id)
453
454         # Get JSON parameters
455         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
456         try:
457             params = json.loads(json_params)
458         except ValueError:
459             raise ExtractorError(u'Invalid JSON')
460
461         self.report_extraction(video_id)
462         try:
463             video_title = params['title']
464             upload_date = unified_strdate(params['release_date_f'])
465             video_description = params['description']
466             video_uploader = params['submitted_by']
467             thumbnail = params['thumbnails'][0]['image']
468         except KeyError:
469             raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])
470
471         # Get all of the formats available
472         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
473         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
474             webpage, u'download list').strip()
475
476         # Get all of the links from the page
477         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
478         links = re.findall(LINK_RE, download_list_html)
479         if len(links) == 0:
480             raise ExtractorError(u'ERROR: no known formats available for video')
481
482         self.to_screen(u'Links found: %d' % len(links))
483
484         formats = []
485         for link in links:
486
487             # A link looks like this:
488             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
489             # A path looks like this:
490             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
491             video_url = unescapeHTML(link)
492             path = compat_urllib_parse_urlparse(video_url).path
493             extension = os.path.splitext(path)[1][1:]
494             format = path.split('/')[4].split('_')[:2]
495             size = format[0]
496             bitrate = format[1]
497             format = "-".join(format)
498             # title = u'%s-%s-%s' % (video_title, size, bitrate)
499
500             formats.append({
501                 'id': video_id,
502                 'url': video_url,
503                 'uploader': video_uploader,
504                 'upload_date': upload_date,
505                 'title': video_title,
506                 'ext': extension,
507                 'format': format,
508                 'thumbnail': thumbnail,
509                 'description': video_description
510             })
511
512         if self._downloader.params.get('listformats', None):
513             self._print_formats(formats)
514             return
515
516         req_format = self._downloader.params.get('format', None)
517         self.to_screen(u'Format: %s' % req_format)
518
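        # The page lists downloads best quality first, so 'best' maps to the first entry and 'worst' to the last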
519         if req_format is None or req_format == 'best':
520             return [formats[0]]
521         elif req_format == 'worst':
522             return [formats[-1]]
523         elif req_format in ('-1', 'all'):
524             return formats
525         else:
526             format = self._specific(req_format, formats)
527             if format is None:
528                 raise ExtractorError(u'Requested format not available')
529             return [format]
530
531
532
533 class PornotubeIE(InfoExtractor):
534     """Information extractor for pornotube.com."""
535     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
536
537     def _real_extract(self, url):
538         mobj = re.match(self._VALID_URL, url)
539         if mobj is None:
540             raise ExtractorError(u'Invalid URL: %s' % url)
541
542         video_id = mobj.group('videoid')
543         video_title = mobj.group('title')
544
545         # Get webpage content
546         webpage = self._download_webpage(url, video_id)
547
548         # Get the video URL
549         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
550         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
551         video_url = compat_urllib_parse.unquote(video_url)
552
553         # Get the upload date
554         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
555         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
556         if upload_date: upload_date = unified_strdate(upload_date)
557
558         info = {'id': video_id,
559                 'url': video_url,
560                 'uploader': None,
561                 'upload_date': upload_date,
562                 'title': video_title,
563                 'ext': 'flv',
564                 'format': 'flv'}
565
566         return [info]
567
568 class YouJizzIE(InfoExtractor):
569     """Information extractor for youjizz.com."""
570     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
571
572     def _real_extract(self, url):
573         mobj = re.match(self._VALID_URL, url)
574         if mobj is None:
575             raise ExtractorError(u'Invalid URL: %s' % url)
576
577         video_id = mobj.group('videoid')
578
579         # Get webpage content
580         webpage = self._download_webpage(url, video_id)
581
582         # Get the video title
583         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
584             webpage, u'title').strip()
585
586         # Get the embed page
587         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
588         if result is None:
589             raise ExtractorError(u'ERROR: unable to extract embed page')
590
591         embed_page_url = result.group(0).strip()
592         video_id = result.group('videoid')
593
594         webpage = self._download_webpage(embed_page_url, video_id)
595
596         # Get the video URL
597         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
598             webpage, u'video URL')
599
600         info = {'id': video_id,
601                 'url': video_url,
602                 'title': video_title,
603                 'ext': 'flv',
604                 'format': 'flv',
605                 'player_url': embed_page_url}
606
607         return [info]
608
609 class EightTracksIE(InfoExtractor):
610     IE_NAME = '8tracks'
611     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
612
613     def _real_extract(self, url):
614         mobj = re.match(self._VALID_URL, url)
615         if mobj is None:
616             raise ExtractorError(u'Invalid URL: %s' % url)
617         playlist_id = mobj.group('id')
618
619         webpage = self._download_webpage(url, playlist_id)
620
621         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
622         data = json.loads(json_like)
623
624         session = str(random.randint(0, 1000000000))
625         mix_id = data['id']
626         track_count = data['tracks_count']
627         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
628         next_url = first_url
629         res = []
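        # Fetch tracks one at a time; each response points at the next track until at_last_track is set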
630         for i in itertools.count():
631             api_json = self._download_webpage(next_url, playlist_id,
632                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
633                 errnote=u'Failed to download song information')
634             api_data = json.loads(api_json)
635             track_data = api_data[u'set']['track']
636             info = {
637                 'id': track_data['id'],
638                 'url': track_data['track_file_stream_url'],
639                 'title': track_data['performer'] + u' - ' + track_data['name'],
640                 'raw_title': track_data['name'],
641                 'uploader_id': data['user']['login'],
642                 'ext': 'm4a',
643             }
644             res.append(info)
645             if api_data['set']['at_last_track']:
646                 break
647             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
648         return res
649
650 class KeekIE(InfoExtractor):
651     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
652     IE_NAME = u'keek'
653
654     def _real_extract(self, url):
655         m = re.match(self._VALID_URL, url)
656         video_id = m.group('videoID')
657
658         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
659         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
660         webpage = self._download_webpage(url, video_id)
661
662         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
663             webpage, u'title')
664
665         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
666             webpage, u'uploader', fatal=False)
667
668         info = {
669                 'id': video_id,
670                 'url': video_url,
671                 'ext': 'mp4',
672                 'title': video_title,
673                 'thumbnail': thumbnail,
674                 'uploader': uploader
675         }
676         return [info]
677
678
679 class MySpassIE(InfoExtractor):
680     _VALID_URL = r'http://www.myspass.de/.*'
681
682     def _real_extract(self, url):
683         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
684
685         # video id is the last path element of the URL
686         # usually there is a trailing slash, so also try the second-to-last element
687         url_path = compat_urllib_parse_urlparse(url).path
688         url_parent_path, video_id = os.path.split(url_path)
689         if not video_id:
690             _, video_id = os.path.split(url_parent_path)
691
692         # get metadata
693         metadata_url = META_DATA_URL_TEMPLATE % video_id
694         metadata_text = self._download_webpage(metadata_url, video_id)
695         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
696
697         # extract values from metadata
698         url_flv_el = metadata.find('url_flv')
699         if url_flv_el is None:
700             raise ExtractorError(u'Unable to extract download url')
701         video_url = url_flv_el.text
702         extension = os.path.splitext(video_url)[1][1:]
703         title_el = metadata.find('title')
704         if title_el is None:
705             raise ExtractorError(u'Unable to extract title')
706         title = title_el.text
707         format_id_el = metadata.find('format_id')
708         if format_id_el is None:
709             format = extension
710         else:
711             format = format_id_el.text
712         description_el = metadata.find('description')
713         if description_el is not None:
714             description = description_el.text
715         else:
716             description = None
717         imagePreview_el = metadata.find('imagePreview')
718         if imagePreview_el is not None:
719             thumbnail = imagePreview_el.text
720         else:
721             thumbnail = None
722         info = {
723             'id': video_id,
724             'url': video_url,
725             'title': title,
726             'ext': extension,
727             'format': format,
728             'thumbnail': thumbnail,
729             'description': description
730         }
731         return [info]
732
733 class SpiegelIE(InfoExtractor):
734     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
735
736     def _real_extract(self, url):
737         m = re.match(self._VALID_URL, url)
738         video_id = m.group('videoID')
739
740         webpage = self._download_webpage(url, video_id)
741
742         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
743             webpage, u'title')
744
745         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
746         xml_code = self._download_webpage(xml_url, video_id,
747                     note=u'Downloading XML', errnote=u'Failed to download XML')
748
749         idoc = xml.etree.ElementTree.fromstring(xml_code)
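        # Use the last variant listed in the XML and build the stream URL from its filename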
750         last_type = idoc[-1]
751         filename = last_type.findall('./filename')[0].text
752         duration = float(last_type.findall('./duration')[0].text)
753
754         video_url = 'http://video2.spiegel.de/flash/' + filename
755         video_ext = filename.rpartition('.')[2]
756         info = {
757             'id': video_id,
758             'url': video_url,
759             'ext': video_ext,
760             'title': video_title,
761             'duration': duration,
762         }
763         return [info]
764
765 class LiveLeakIE(InfoExtractor):
766
767     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
768     IE_NAME = u'liveleak'
769
770     def _real_extract(self, url):
771         mobj = re.match(self._VALID_URL, url)
772         if mobj is None:
773             raise ExtractorError(u'Invalid URL: %s' % url)
774
775         video_id = mobj.group('video_id')
776
777         webpage = self._download_webpage(url, video_id)
778
779         video_url = self._search_regex(r'file: "(.*?)",',
780             webpage, u'video URL')
781
782         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
783             webpage, u'title').replace('LiveLeak.com -', '').strip()
784
785         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
786             webpage, u'description', fatal=False)
787
788         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
789             webpage, u'uploader', fatal=False)
790
791         info = {
792             'id':  video_id,
793             'url': video_url,
794             'ext': 'mp4',
795             'title': video_title,
796             'description': video_description,
797             'uploader': video_uploader
798         }
799
800         return [info]
801
802
803
804 class TumblrIE(InfoExtractor):
805     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
806
807     def _real_extract(self, url):
808         m_url = re.match(self._VALID_URL, url)
809         video_id = m_url.group('id')
810         blog = m_url.group('blog_name')
811
812         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
813         webpage = self._download_webpage(url, video_id)
814
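        # The player markup is embedded with \x22-escaped quotes, so match the escaped form directly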
815         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
816         video = re.search(re_video, webpage)
817         if video is None:
818             raise ExtractorError(u'Unable to extract video')
819         video_url = video.group('video_url')
820         ext = video.group('ext')
821
822         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
823             webpage, u'thumbnail', fatal=False)  # We pick the first poster
824         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
825
826         # The only place a title can be found; it is not complete,
827         # but searching in other places doesn't work for all videos
828         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
829             webpage, u'title', flags=re.DOTALL)
830
831         return [{'id': video_id,
832                  'url': video_url,
833                  'title': video_title,
834                  'thumbnail': video_thumbnail,
835                  'ext': ext
836                  }]
837
838 class BandcampIE(InfoExtractor):
839     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
840
841     def _real_extract(self, url):
842         mobj = re.match(self._VALID_URL, url)
843         title = mobj.group('title')
844         webpage = self._download_webpage(url, title)
845         # We get the link to the free download page
846         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
847         if m_download is None:
848             raise ExtractorError(u'No free songs found')
849
850         download_link = m_download.group(1)
851         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
852                        webpage, re.MULTILINE|re.DOTALL).group('id')
853
854         download_webpage = self._download_webpage(download_link, id,
855                                                   'Downloading free downloads page')
856         # We get the dictionary of the track from some javascript code
857         info = re.search(r'items: (.*?),$',
858                          download_webpage, re.MULTILINE).group(1)
859         info = json.loads(info)[0]
860         # We pick mp3-320 for now, until format selection can be easily implemented.
861         mp3_info = info[u'downloads'][u'mp3-320']
862         # If we try to use this url it says the link has expired
863         initial_url = mp3_info[u'url']
864         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
865         m_url = re.match(re_url, initial_url)
866         # We build the url we will use to get the final track url
867         # This url is built by Bandcamp in the script download_bunde_*.js
868         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
869         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
870         # If we could correctly generate the .rand field the url would be
871         # in the "download_url" key
872         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
873
874         track_info = {'id': id,
875                       'title': info[u'title'],
876                       'ext': 'mp3',
877                       'url': final_url,
878                       'thumbnail': info[u'thumb_url'],
879                       'uploader': info[u'artist']
880                       }
881
882         return [track_info]
883
884 class RedTubeIE(InfoExtractor):
885     """Information Extractor for redtube"""
886     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
887
888     def _real_extract(self,url):
889         mobj = re.match(self._VALID_URL, url)
890         if mobj is None:
891             raise ExtractorError(u'Invalid URL: %s' % url)
892
893         video_id = mobj.group('id')
894         video_extension = 'mp4'
895         webpage = self._download_webpage(url, video_id)
896
897         self.report_extraction(video_id)
898
899         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
900             webpage, u'video URL')
901
902         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
903             webpage, u'title')
904
905         return [{
906             'id':       video_id,
907             'url':      video_url,
908             'ext':      video_extension,
909             'title':    video_title,
910         }]
911
912 class InaIE(InfoExtractor):
913     """Information Extractor for Ina.fr"""
914     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
915
916     def _real_extract(self,url):
917         mobj = re.match(self._VALID_URL, url)
918
919         video_id = mobj.group('id')
920         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
921         video_extension = 'mp4'
922         webpage = self._download_webpage(mrss_url, video_id)
923
924         self.report_extraction(video_id)
925
926         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
927             webpage, u'video URL')
928
929         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
930             webpage, u'title')
931
932         return [{
933             'id':       video_id,
934             'url':      video_url,
935             'ext':      video_extension,
936             'title':    video_title,
937         }]
938
939 class HowcastIE(InfoExtractor):
940     """Information Extractor for Howcast.com"""
941     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
942
943     def _real_extract(self, url):
944         mobj = re.match(self._VALID_URL, url)
945
946         video_id = mobj.group('id')
947         webpage_url = 'http://www.howcast.com/videos/' + video_id
948         webpage = self._download_webpage(webpage_url, video_id)
949
950         self.report_extraction(video_id)
951
952         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
953             webpage, u'video URL')
954
955         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
956             webpage, u'title')
957
958         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
959             webpage, u'description', fatal=False)
960
961         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
962             webpage, u'thumbnail', fatal=False)
963
964         return [{
965             'id':       video_id,
966             'url':      video_url,
967             'ext':      'mp4',
968             'title':    video_title,
969             'description': video_description,
970             'thumbnail': thumbnail,
971         }]
972
973 class VineIE(InfoExtractor):
974     """Information Extractor for Vine.co"""
975     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
976
977     def _real_extract(self, url):
978         mobj = re.match(self._VALID_URL, url)
979
980         video_id = mobj.group('id')
981         webpage_url = 'https://vine.co/v/' + video_id
982         webpage = self._download_webpage(webpage_url, video_id)
983
984         self.report_extraction(video_id)
985
986         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
987             webpage, u'video URL')
988
989         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
990             webpage, u'title')
991
992         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
993             webpage, u'thumbnail', fatal=False)
994
995         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
996             webpage, u'uploader', fatal=False, flags=re.DOTALL)
997
998         return [{
999             'id':        video_id,
1000             'url':       video_url,
1001             'ext':       'mp4',
1002             'title':     video_title,
1003             'thumbnail': thumbnail,
1004             'uploader':  uploader,
1005         }]
1006
1007 class FlickrIE(InfoExtractor):
1008     """Information Extractor for Flickr videos"""
1009     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1010
1011     def _real_extract(self, url):
1012         mobj = re.match(self._VALID_URL, url)
1013
1014         video_id = mobj.group('id')
1015         video_uploader_id = mobj.group('uploader_id')
1016         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1017         webpage = self._download_webpage(webpage_url, video_id)
1018
1019         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1020
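        # Two requests are needed: the first XML only yields a node id, the second returns the actual stream location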
1021         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1022         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1023
1024         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1025             first_xml, u'node_id')
1026
1027         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1028         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1029
1030         self.report_extraction(video_id)
1031
1032         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1033         if mobj is None:
1034             raise ExtractorError(u'Unable to extract video url')
1035         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1036
1037         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1038             webpage, u'video title')
1039
1040         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1041             webpage, u'description', fatal=False)
1042
1043         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1044             webpage, u'thumbnail', fatal=False)
1045
1046         return [{
1047             'id':          video_id,
1048             'url':         video_url,
1049             'ext':         'mp4',
1050             'title':       video_title,
1051             'description': video_description,
1052             'thumbnail':   thumbnail,
1053             'uploader_id': video_uploader_id,
1054         }]
1055
1056 class TeamcocoIE(InfoExtractor):
1057     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1058
1059     def _real_extract(self, url):
1060         mobj = re.match(self._VALID_URL, url)
1061         if mobj is None:
1062             raise ExtractorError(u'Invalid URL: %s' % url)
1063         url_title = mobj.group('url_title')
1064         webpage = self._download_webpage(url, url_title)
1065
1066         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1067             webpage, u'video id')
1068
1069         self.report_extraction(video_id)
1070
1071         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1072             webpage, u'title')
1073
1074         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1075             webpage, u'thumbnail', fatal=False)
1076
1077         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1078             webpage, u'description', fatal=False)
1079
1080         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1081         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1082
1083         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1084             data, u'video URL')
1085
1086         return [{
1087             'id':          video_id,
1088             'url':         video_url,
1089             'ext':         'mp4',
1090             'title':       video_title,
1091             'thumbnail':   thumbnail,
1092             'description': video_description,
1093         }]
1094
1095 class XHamsterIE(InfoExtractor):
1096     """Information Extractor for xHamster"""
1097     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1098
1099     def _real_extract(self,url):
1100         mobj = re.match(self._VALID_URL, url)
1101
1102         video_id = mobj.group('id')
1103         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1104         webpage = self._download_webpage(mrss_url, video_id)
1105
1106         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1107         if mobj is None:
1108             raise ExtractorError(u'Unable to extract media URL')
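        # An empty 'srv' value means 'file' already holds the full URL; otherwise join the server and the key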
1109         if len(mobj.group('server')) == 0:
1110             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1111         else:
1112             video_url = mobj.group('server')+'/key='+mobj.group('file')
1113         video_extension = video_url.split('.')[-1]
1114
1115         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1116             webpage, u'title')
1117
1118         # Can't see the description anywhere in the UI
1119         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1120         #     webpage, u'description', fatal=False)
1121         # if video_description: video_description = unescapeHTML(video_description)
1122
1123         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1124         if mobj:
1125             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1126         else:
1127             video_upload_date = None
1128             self._downloader.report_warning(u'Unable to extract upload date')
1129
1130         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1131             webpage, u'uploader id', default=u'anonymous')
1132
1133         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1134             webpage, u'thumbnail', fatal=False)
1135
1136         return [{
1137             'id':       video_id,
1138             'url':      video_url,
1139             'ext':      video_extension,
1140             'title':    video_title,
1141             # 'description': video_description,
1142             'upload_date': video_upload_date,
1143             'uploader_id': video_uploader_id,
1144             'thumbnail': video_thumbnail
1145         }]
1146
1147 class HypemIE(InfoExtractor):
1148     """Information Extractor for hypem"""
1149     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1150
1151     def _real_extract(self, url):
1152         mobj = re.match(self._VALID_URL, url)
1153         if mobj is None:
1154             raise ExtractorError(u'Invalid URL: %s' % url)
1155         track_id = mobj.group(1)
1156
1157         data = { 'ax': 1, 'ts': time.time() }
1158         data_encoded = compat_urllib_parse.urlencode(data)
1159         complete_url = url + "?" + data_encoded
1160         request = compat_urllib_request.Request(complete_url)
1161         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1162         cookie = urlh.headers.get('Set-Cookie', '')
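        # Keep the session cookie; it is sent again with the serve request below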
1163
1164         self.report_extraction(track_id)
1165
1166         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1167             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1168         try:
1169             track_list = json.loads(html_tracks)
1170             track = track_list[u'tracks'][0]
1171         except ValueError:
1172             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1173
1174         key = track[u"key"]
1175         track_id = track[u"id"]
1176         artist = track[u"artist"]
1177         title = track[u"song"]
1178
1179         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1180         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1181         request.add_header('cookie', cookie)
1182         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1183         try:
1184             song_data = json.loads(song_data_json)
1185         except ValueError:
1186             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1187         final_url = song_data[u"url"]
1188
1189         return [{
1190             'id':       track_id,
1191             'url':      final_url,
1192             'ext':      "mp3",
1193             'title':    title,
1194             'artist':   artist,
1195         }]
1196
1197 class Vbox7IE(InfoExtractor):
1198     """Information Extractor for Vbox7"""
1199     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1200
1201     def _real_extract(self,url):
1202         mobj = re.match(self._VALID_URL, url)
1203         if mobj is None:
1204             raise ExtractorError(u'Invalid URL: %s' % url)
1205         video_id = mobj.group(1)
1206
1207         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1208         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1209         redirect_url = urlh.geturl() + new_location
1210         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1211
1212         title = self._html_search_regex(r'<title>(.*)</title>',
1213             webpage, u'title').split('/')[0].strip()
1214
1215         ext = "flv"
1216         info_url = "http://vbox7.com/play/magare.do"
1217         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1218         info_request = compat_urllib_request.Request(info_url, data)
1219         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1220         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1221         if info_response is None:
1222             raise ExtractorError(u'Unable to extract the media url')
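        # The info response is a bare query string: the first value is the media URL, the second the thumbnail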
1223         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1224
1225         return [{
1226             'id':        video_id,
1227             'url':       final_url,
1228             'ext':       ext,
1229             'title':     title,
1230             'thumbnail': thumbnail_url,
1231         }]
1232
1233
1234 def gen_extractors():
1235     """ Return a list of an instance of every supported extractor.
1236     The order does matter; the first extractor matched is the one handling the URL.
1237     """
1238     return [
1239         YoutubePlaylistIE(),
1240         YoutubeChannelIE(),
1241         YoutubeUserIE(),
1242         YoutubeSearchIE(),
1243         YoutubeIE(),
1244         MetacafeIE(),
1245         DailymotionIE(),
1246         GoogleSearchIE(),
1247         PhotobucketIE(),
1248         YahooIE(),
1249         YahooSearchIE(),
1250         DepositFilesIE(),
1251         FacebookIE(),
1252         BlipTVIE(),
1253         BlipTVUserIE(),
1254         VimeoIE(),
1255         MyVideoIE(),
1256         ComedyCentralIE(),
1257         EscapistIE(),
1258         CollegeHumorIE(),
1259         XVideosIE(),
1260         SoundcloudSetIE(),
1261         SoundcloudIE(),
1262         InfoQIE(),
1263         MixcloudIE(),
1264         StanfordOpenClassroomIE(),
1265         MTVIE(),
1266         YoukuIE(),
1267         XNXXIE(),
1268         YouJizzIE(),
1269         PornotubeIE(),
1270         YouPornIE(),
1271         GooglePlusIE(),
1272         ArteTvIE(),
1273         NBAIE(),
1274         WorldStarHipHopIE(),
1275         JustinTVIE(),
1276         FunnyOrDieIE(),
1277         SteamIE(),
1278         UstreamIE(),
1279         RBMARadioIE(),
1280         EightTracksIE(),
1281         KeekIE(),
1282         TEDIE(),
1283         MySpassIE(),
1284         SpiegelIE(),
1285         LiveLeakIE(),
1286         ARDIE(),
1287         ZDFIE(),
1288         TumblrIE(),
1289         BandcampIE(),
1290         RedTubeIE(),
1291         InaIE(),
1292         HowcastIE(),
1293         VineIE(),
1294         FlickrIE(),
1295         TeamcocoIE(),
1296         XHamsterIE(),
1297         HypemIE(),
1298         Vbox7IE(),
1299         GametrailersIE(),
1300         StatigramIE(),
1301         GenericIE()
1302     ]
1303
1304 def get_info_extractor(ie_name):
1305     """Returns the info extractor class with the given ie_name"""
1306     return globals()[ie_name+'IE']