_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.eighttracks import EightTracksIE
  29 from .extractor.escapist import EscapistIE
  30 from .extractor.facebook import FacebookIE
  31 from .extractor.funnyordie import FunnyOrDieIE
  32 from .extractor.gametrailers import GametrailersIE
  33 from .extractor.generic import GenericIE
  34 from .extractor.googleplus import GooglePlusIE
  35 from .extractor.googlesearch import GoogleSearchIE
  36 from .extractor.infoq import InfoQIE
  37 from .extractor.justintv import JustinTVIE
  38 from .extractor.keek import KeekIE
  39 from .extractor.metacafe import MetacafeIE
  40 from .extractor.mixcloud import MixcloudIE
  41 from .extractor.mtv import MTVIE
  42 from .extractor.myvideo import MyVideoIE
  43 from .extractor.nba import NBAIE
  44 from .extractor.statigram import StatigramIE
  45 from .extractor.photobucket import PhotobucketIE
  46 from .extractor.pornotube import PornotubeIE
  47 from .extractor.rbmaradio import RBMARadioIE
  48 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  49 from .extractor.stanfordoc import StanfordOpenClassroomIE
  50 from .extractor.steam import SteamIE
  51 from .extractor.ted import TEDIE
  52 from .extractor.ustream import UstreamIE
  53 from .extractor.vimeo import VimeoIE
  54 from .extractor.worldstarhiphop import WorldStarHipHopIE
  55 from .extractor.xnxx import XNXXIE
  56 from .extractor.xvideos import XVideosIE
  57 from .extractor.yahoo import YahooIE, YahooSearchIE
  58 from .extractor.youjizz import YouJizzIE
  59 from .extractor.youku import YoukuIE
  60 from .extractor.youporn import YouPornIE
  61 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  62 from .extractor.zdf import ZDFIE
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91 class MySpassIE(InfoExtractor):
  92     _VALID_URL = r'http://www.myspass.de/.*'
  93
  94     def _real_extract(self, url):
  95         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
  96
  97         # video id is the last path element of the URL
  98         # usually there is a trailing slash, so also try the second but last
  99         url_path = compat_urllib_parse_urlparse(url).path
 100         url_parent_path, video_id = os.path.split(url_path)
 101         if not video_id:
 102             _, video_id = os.path.split(url_parent_path)
 103
 104         # get metadata
 105         metadata_url = META_DATA_URL_TEMPLATE % video_id
 106         metadata_text = self._download_webpage(metadata_url, video_id)
 107         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 108
 109         # extract values from metadata
 110         url_flv_el = metadata.find('url_flv')
 111         if url_flv_el is None:
 112             raise ExtractorError(u'Unable to extract download url')
 113         video_url = url_flv_el.text
 114         extension = os.path.splitext(video_url)[1][1:]
 115         title_el = metadata.find('title')
 116         if title_el is None:
 117             raise ExtractorError(u'Unable to extract title')
 118         title = title_el.text
 119         format_id_el = metadata.find('format_id')
 120         if format_id_el is None:
 121             format = ext
 122         else:
 123             format = format_id_el.text
 124         description_el = metadata.find('description')
 125         if description_el is not None:
 126             description = description_el.text
 127         else:
 128             description = None
 129         imagePreview_el = metadata.find('imagePreview')
 130         if imagePreview_el is not None:
 131             thumbnail = imagePreview_el.text
 132         else:
 133             thumbnail = None
 134         info = {
 135             'id': video_id,
 136             'url': video_url,
 137             'title': title,
 138             'ext': extension,
 139             'format': format,
 140             'thumbnail': thumbnail,
 141             'description': description
 142         }
 143         return [info]
 144
 145 class SpiegelIE(InfoExtractor):
 146     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 147
 148     def _real_extract(self, url):
 149         m = re.match(self._VALID_URL, url)
 150         video_id = m.group('videoID')
 151
 152         webpage = self._download_webpage(url, video_id)
 153
 154         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 155             webpage, u'title')
 156
 157         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 158         xml_code = self._download_webpage(xml_url, video_id,
 159                     note=u'Downloading XML', errnote=u'Failed to download XML')
 160
 161         idoc = xml.etree.ElementTree.fromstring(xml_code)
 162         last_type = idoc[-1]
 163         filename = last_type.findall('./filename')[0].text
 164         duration = float(last_type.findall('./duration')[0].text)
 165
 166         video_url = 'http://video2.spiegel.de/flash/' + filename
 167         video_ext = filename.rpartition('.')[2]
 168         info = {
 169             'id': video_id,
 170             'url': video_url,
 171             'ext': video_ext,
 172             'title': video_title,
 173             'duration': duration,
 174         }
 175         return [info]
 176
 177 class LiveLeakIE(InfoExtractor):
 178
 179     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 180     IE_NAME = u'liveleak'
 181
 182     def _real_extract(self, url):
 183         mobj = re.match(self._VALID_URL, url)
 184         if mobj is None:
 185             raise ExtractorError(u'Invalid URL: %s' % url)
 186
 187         video_id = mobj.group('video_id')
 188
 189         webpage = self._download_webpage(url, video_id)
 190
 191         video_url = self._search_regex(r'file: "(.*?)",',
 192             webpage, u'video URL')
 193
 194         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 195             webpage, u'title').replace('LiveLeak.com -', '').strip()
 196
 197         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 198             webpage, u'description', fatal=False)
 199
 200         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 201             webpage, u'uploader', fatal=False)
 202
 203         info = {
 204             'id':  video_id,
 205             'url': video_url,
 206             'ext': 'mp4',
 207             'title': video_title,
 208             'description': video_description,
 209             'uploader': video_uploader
 210         }
 211
 212         return [info]
 213
 214
 215
 216 class TumblrIE(InfoExtractor):
 217     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 218
 219     def _real_extract(self, url):
 220         m_url = re.match(self._VALID_URL, url)
 221         video_id = m_url.group('id')
 222         blog = m_url.group('blog_name')
 223
 224         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 225         webpage = self._download_webpage(url, video_id)
 226
 227         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 228         video = re.search(re_video, webpage)
 229         if video is None:
 230            raise ExtractorError(u'Unable to extract video')
 231         video_url = video.group('video_url')
 232         ext = video.group('ext')
 233
 234         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 235             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 236         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 237
 238         # The only place where you can get a title, it's not complete,
 239         # but searching in other places doesn't work for all videos
 240         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 241             webpage, u'title', flags=re.DOTALL)
 242
 243         return [{'id': video_id,
 244                  'url': video_url,
 245                  'title': video_title,
 246                  'thumbnail': video_thumbnail,
 247                  'ext': ext
 248                  }]
 249
 250 class BandcampIE(InfoExtractor):
 251     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 252
 253     def _real_extract(self, url):
 254         mobj = re.match(self._VALID_URL, url)
 255         title = mobj.group('title')
 256         webpage = self._download_webpage(url, title)
 257         # We get the link to the free download page
 258         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 259         if m_download is None:
 260             raise ExtractorError(u'No free songs found')
 261
 262         download_link = m_download.group(1)
 263         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 264                        webpage, re.MULTILINE|re.DOTALL).group('id')
 265
 266         download_webpage = self._download_webpage(download_link, id,
 267                                                   'Downloading free downloads page')
 268         # We get the dictionary of the track from some javascrip code
 269         info = re.search(r'items: (.*?),$',
 270                          download_webpage, re.MULTILINE).group(1)
 271         info = json.loads(info)[0]
 272         # We pick mp3-320 for now, until format selection can be easily implemented.
 273         mp3_info = info[u'downloads'][u'mp3-320']
 274         # If we try to use this url it says the link has expired
 275         initial_url = mp3_info[u'url']
 276         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 277         m_url = re.match(re_url, initial_url)
 278         #We build the url we will use to get the final track url
 279         # This url is build in Bandcamp in the script download_bunde_*.js
 280         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 281         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 282         # If we could correctly generate the .rand field the url would be
 283         #in the "download_url" key
 284         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 285
 286         track_info = {'id':id,
 287                       'title' : info[u'title'],
 288                       'ext' :   'mp3',
 289                       'url' :   final_url,
 290                       'thumbnail' : info[u'thumb_url'],
 291                       'uploader' :  info[u'artist']
 292                       }
 293
 294         return [track_info]
 295
 296 class RedTubeIE(InfoExtractor):
 297     """Information Extractor for redtube"""
 298     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 299
 300     def _real_extract(self,url):
 301         mobj = re.match(self._VALID_URL, url)
 302         if mobj is None:
 303             raise ExtractorError(u'Invalid URL: %s' % url)
 304
 305         video_id = mobj.group('id')
 306         video_extension = 'mp4'
 307         webpage = self._download_webpage(url, video_id)
 308
 309         self.report_extraction(video_id)
 310
 311         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 312             webpage, u'video URL')
 313
 314         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 315             webpage, u'title')
 316
 317         return [{
 318             'id':       video_id,
 319             'url':      video_url,
 320             'ext':      video_extension,
 321             'title':    video_title,
 322         }]
 323
 324 class InaIE(InfoExtractor):
 325     """Information Extractor for Ina.fr"""
 326     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 327
 328     def _real_extract(self,url):
 329         mobj = re.match(self._VALID_URL, url)
 330
 331         video_id = mobj.group('id')
 332         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 333         video_extension = 'mp4'
 334         webpage = self._download_webpage(mrss_url, video_id)
 335
 336         self.report_extraction(video_id)
 337
 338         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 339             webpage, u'video URL')
 340
 341         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 342             webpage, u'title')
 343
 344         return [{
 345             'id':       video_id,
 346             'url':      video_url,
 347             'ext':      video_extension,
 348             'title':    video_title,
 349         }]
 350
 351 class HowcastIE(InfoExtractor):
 352     """Information Extractor for Howcast.com"""
 353     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 354
 355     def _real_extract(self, url):
 356         mobj = re.match(self._VALID_URL, url)
 357
 358         video_id = mobj.group('id')
 359         webpage_url = 'http://www.howcast.com/videos/' + video_id
 360         webpage = self._download_webpage(webpage_url, video_id)
 361
 362         self.report_extraction(video_id)
 363
 364         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 365             webpage, u'video URL')
 366
 367         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 368             webpage, u'title')
 369
 370         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 371             webpage, u'description', fatal=False)
 372
 373         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 374             webpage, u'thumbnail', fatal=False)
 375
 376         return [{
 377             'id':       video_id,
 378             'url':      video_url,
 379             'ext':      'mp4',
 380             'title':    video_title,
 381             'description': video_description,
 382             'thumbnail': thumbnail,
 383         }]
 384
 385 class VineIE(InfoExtractor):
 386     """Information Extractor for Vine.co"""
 387     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 388
 389     def _real_extract(self, url):
 390         mobj = re.match(self._VALID_URL, url)
 391
 392         video_id = mobj.group('id')
 393         webpage_url = 'https://vine.co/v/' + video_id
 394         webpage = self._download_webpage(webpage_url, video_id)
 395
 396         self.report_extraction(video_id)
 397
 398         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 399             webpage, u'video URL')
 400
 401         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 402             webpage, u'title')
 403
 404         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 405             webpage, u'thumbnail', fatal=False)
 406
 407         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 408             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 409
 410         return [{
 411             'id':        video_id,
 412             'url':       video_url,
 413             'ext':       'mp4',
 414             'title':     video_title,
 415             'thumbnail': thumbnail,
 416             'uploader':  uploader,
 417         }]
 418
 419 class FlickrIE(InfoExtractor):
 420     """Information Extractor for Flickr videos"""
 421     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 422
 423     def _real_extract(self, url):
 424         mobj = re.match(self._VALID_URL, url)
 425
 426         video_id = mobj.group('id')
 427         video_uploader_id = mobj.group('uploader_id')
 428         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 429         webpage = self._download_webpage(webpage_url, video_id)
 430
 431         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 432
 433         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 434         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 435
 436         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 437             first_xml, u'node_id')
 438
 439         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 440         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 441
 442         self.report_extraction(video_id)
 443
 444         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 445         if mobj is None:
 446             raise ExtractorError(u'Unable to extract video url')
 447         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 448
 449         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 450             webpage, u'video title')
 451
 452         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 453             webpage, u'description', fatal=False)
 454
 455         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 456             webpage, u'thumbnail', fatal=False)
 457
 458         return [{
 459             'id':          video_id,
 460             'url':         video_url,
 461             'ext':         'mp4',
 462             'title':       video_title,
 463             'description': video_description,
 464             'thumbnail':   thumbnail,
 465             'uploader_id': video_uploader_id,
 466         }]
 467
 468 class TeamcocoIE(InfoExtractor):
 469     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 470
 471     def _real_extract(self, url):
 472         mobj = re.match(self._VALID_URL, url)
 473         if mobj is None:
 474             raise ExtractorError(u'Invalid URL: %s' % url)
 475         url_title = mobj.group('url_title')
 476         webpage = self._download_webpage(url, url_title)
 477
 478         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 479             webpage, u'video id')
 480
 481         self.report_extraction(video_id)
 482
 483         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 484             webpage, u'title')
 485
 486         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 487             webpage, u'thumbnail', fatal=False)
 488
 489         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 490             webpage, u'description', fatal=False)
 491
 492         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 493         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 494
 495         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 496             data, u'video URL')
 497
 498         return [{
 499             'id':          video_id,
 500             'url':         video_url,
 501             'ext':         'mp4',
 502             'title':       video_title,
 503             'thumbnail':   thumbnail,
 504             'description': video_description,
 505         }]
 506
 507 class XHamsterIE(InfoExtractor):
 508     """Information Extractor for xHamster"""
 509     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 510
 511     def _real_extract(self,url):
 512         mobj = re.match(self._VALID_URL, url)
 513
 514         video_id = mobj.group('id')
 515         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 516         webpage = self._download_webpage(mrss_url, video_id)
 517
 518         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 519         if mobj is None:
 520             raise ExtractorError(u'Unable to extract media URL')
 521         if len(mobj.group('server')) == 0:
 522             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 523         else:
 524             video_url = mobj.group('server')+'/key='+mobj.group('file')
 525         video_extension = video_url.split('.')[-1]
 526
 527         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 528             webpage, u'title')
 529
 530         # Can't see the description anywhere in the UI
 531         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 532         #     webpage, u'description', fatal=False)
 533         # if video_description: video_description = unescapeHTML(video_description)
 534
 535         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 536         if mobj:
 537             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 538         else:
 539             video_upload_date = None
 540             self._downloader.report_warning(u'Unable to extract upload date')
 541
 542         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 543             webpage, u'uploader id', default=u'anonymous')
 544
 545         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 546             webpage, u'thumbnail', fatal=False)
 547
 548         return [{
 549             'id':       video_id,
 550             'url':      video_url,
 551             'ext':      video_extension,
 552             'title':    video_title,
 553             # 'description': video_description,
 554             'upload_date': video_upload_date,
 555             'uploader_id': video_uploader_id,
 556             'thumbnail': video_thumbnail
 557         }]
 558
 559 class HypemIE(InfoExtractor):
 560     """Information Extractor for hypem"""
 561     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 562
 563     def _real_extract(self, url):
 564         mobj = re.match(self._VALID_URL, url)
 565         if mobj is None:
 566             raise ExtractorError(u'Invalid URL: %s' % url)
 567         track_id = mobj.group(1)
 568
 569         data = { 'ax': 1, 'ts': time.time() }
 570         data_encoded = compat_urllib_parse.urlencode(data)
 571         complete_url = url + "?" + data_encoded
 572         request = compat_urllib_request.Request(complete_url)
 573         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 574         cookie = urlh.headers.get('Set-Cookie', '')
 575
 576         self.report_extraction(track_id)
 577
 578         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 579             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 580         try:
 581             track_list = json.loads(html_tracks)
 582             track = track_list[u'tracks'][0]
 583         except ValueError:
 584             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 585
 586         key = track[u"key"]
 587         track_id = track[u"id"]
 588         artist = track[u"artist"]
 589         title = track[u"song"]
 590
 591         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 592         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 593         request.add_header('cookie', cookie)
 594         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 595         try:
 596             song_data = json.loads(song_data_json)
 597         except ValueError:
 598             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 599         final_url = song_data[u"url"]
 600
 601         return [{
 602             'id':       track_id,
 603             'url':      final_url,
 604             'ext':      "mp3",
 605             'title':    title,
 606             'artist':   artist,
 607         }]
 608
 609 class Vbox7IE(InfoExtractor):
 610     """Information Extractor for Vbox7"""
 611     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 612
 613     def _real_extract(self,url):
 614         mobj = re.match(self._VALID_URL, url)
 615         if mobj is None:
 616             raise ExtractorError(u'Invalid URL: %s' % url)
 617         video_id = mobj.group(1)
 618
 619         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 620         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 621         redirect_url = urlh.geturl() + new_location
 622         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 623
 624         title = self._html_search_regex(r'<title>(.*)</title>',
 625             webpage, u'title').split('/')[0].strip()
 626
 627         ext = "flv"
 628         info_url = "http://vbox7.com/play/magare.do"
 629         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 630         info_request = compat_urllib_request.Request(info_url, data)
 631         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 632         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 633         if info_response is None:
 634             raise ExtractorError(u'Unable to extract the media url')
 635         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 636
 637         return [{
 638             'id':        video_id,
 639             'url':       final_url,
 640             'ext':       ext,
 641             'title':     title,
 642             'thumbnail': thumbnail_url,
 643         }]
 644
 645
 646 def gen_extractors():
 647     """ Return a list of an instance of every supported extractor.
 648     The order does matter; the first extractor matched is the one handling the URL.
 649     """
 650     return [
 651         YoutubePlaylistIE(),
 652         YoutubeChannelIE(),
 653         YoutubeUserIE(),
 654         YoutubeSearchIE(),
 655         YoutubeIE(),
 656         MetacafeIE(),
 657         DailymotionIE(),
 658         GoogleSearchIE(),
 659         PhotobucketIE(),
 660         YahooIE(),
 661         YahooSearchIE(),
 662         DepositFilesIE(),
 663         FacebookIE(),
 664         BlipTVIE(),
 665         BlipTVUserIE(),
 666         VimeoIE(),
 667         MyVideoIE(),
 668         ComedyCentralIE(),
 669         EscapistIE(),
 670         CollegeHumorIE(),
 671         XVideosIE(),
 672         SoundcloudSetIE(),
 673         SoundcloudIE(),
 674         InfoQIE(),
 675         MixcloudIE(),
 676         StanfordOpenClassroomIE(),
 677         MTVIE(),
 678         YoukuIE(),
 679         XNXXIE(),
 680         YouJizzIE(),
 681         PornotubeIE(),
 682         YouPornIE(),
 683         GooglePlusIE(),
 684         ArteTvIE(),
 685         NBAIE(),
 686         WorldStarHipHopIE(),
 687         JustinTVIE(),
 688         FunnyOrDieIE(),
 689         SteamIE(),
 690         UstreamIE(),
 691         RBMARadioIE(),
 692         EightTracksIE(),
 693         KeekIE(),
 694         TEDIE(),
 695         MySpassIE(),
 696         SpiegelIE(),
 697         LiveLeakIE(),
 698         ARDIE(),
 699         ZDFIE(),
 700         TumblrIE(),
 701         BandcampIE(),
 702         RedTubeIE(),
 703         InaIE(),
 704         HowcastIE(),
 705         VineIE(),
 706         FlickrIE(),
 707         TeamcocoIE(),
 708         XHamsterIE(),
 709         HypemIE(),
 710         Vbox7IE(),
 711         GametrailersIE(),
 712         StatigramIE(),
 713         GenericIE()
 714     ]
 715
 716 def get_info_extractor(ie_name):
 717     """Returns the info extractor class with the given ie_name"""
 718     return globals()[ie_name+'IE']