_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.eighttracks import EightTracksIE
  29 from .extractor.escapist import EscapistIE
  30 from .extractor.facebook import FacebookIE
  31 from .extractor.funnyordie import FunnyOrDieIE
  32 from .extractor.gametrailers import GametrailersIE
  33 from .extractor.generic import GenericIE
  34 from .extractor.googleplus import GooglePlusIE
  35 from .extractor.googlesearch import GoogleSearchIE
  36 from .extractor.infoq import InfoQIE
  37 from .extractor.justintv import JustinTVIE
  38 from .extractor.metacafe import MetacafeIE
  39 from .extractor.mixcloud import MixcloudIE
  40 from .extractor.mtv import MTVIE
  41 from .extractor.myvideo import MyVideoIE
  42 from .extractor.nba import NBAIE
  43 from .extractor.statigram import StatigramIE
  44 from .extractor.photobucket import PhotobucketIE
  45 from .extractor.pornotube import PornotubeIE
  46 from .extractor.rbmaradio import RBMARadioIE
  47 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  48 from .extractor.stanfordoc import StanfordOpenClassroomIE
  49 from .extractor.steam import SteamIE
  50 from .extractor.ted import TEDIE
  51 from .extractor.ustream import UstreamIE
  52 from .extractor.vimeo import VimeoIE
  53 from .extractor.worldstarhiphop import WorldStarHipHopIE
  54 from .extractor.xnxx import XNXXIE
  55 from .extractor.xvideos import XVideosIE
  56 from .extractor.yahoo import YahooIE, YahooSearchIE
  57 from .extractor.youjizz import YouJizzIE
  58 from .extractor.youku import YoukuIE
  59 from .extractor.youporn import YouPornIE
  60 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  61 from .extractor.zdf import ZDFIE
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88 class KeekIE(InfoExtractor):
  89     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
  90     IE_NAME = u'keek'
  91
  92     def _real_extract(self, url):
  93         m = re.match(self._VALID_URL, url)
  94         video_id = m.group('videoID')
  95
  96         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
  97         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
  98         webpage = self._download_webpage(url, video_id)
  99
 100         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 101             webpage, u'title')
 102
 103         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 104             webpage, u'uploader', fatal=False)
 105
 106         info = {
 107                 'id': video_id,
 108                 'url': video_url,
 109                 'ext': 'mp4',
 110                 'title': video_title,
 111                 'thumbnail': thumbnail,
 112                 'uploader': uploader
 113         }
 114         return [info]
 115
 116
 117 class MySpassIE(InfoExtractor):
 118     _VALID_URL = r'http://www.myspass.de/.*'
 119
 120     def _real_extract(self, url):
 121         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 122
 123         # video id is the last path element of the URL
 124         # usually there is a trailing slash, so also try the second but last
 125         url_path = compat_urllib_parse_urlparse(url).path
 126         url_parent_path, video_id = os.path.split(url_path)
 127         if not video_id:
 128             _, video_id = os.path.split(url_parent_path)
 129
 130         # get metadata
 131         metadata_url = META_DATA_URL_TEMPLATE % video_id
 132         metadata_text = self._download_webpage(metadata_url, video_id)
 133         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 134
 135         # extract values from metadata
 136         url_flv_el = metadata.find('url_flv')
 137         if url_flv_el is None:
 138             raise ExtractorError(u'Unable to extract download url')
 139         video_url = url_flv_el.text
 140         extension = os.path.splitext(video_url)[1][1:]
 141         title_el = metadata.find('title')
 142         if title_el is None:
 143             raise ExtractorError(u'Unable to extract title')
 144         title = title_el.text
 145         format_id_el = metadata.find('format_id')
 146         if format_id_el is None:
 147             format = ext
 148         else:
 149             format = format_id_el.text
 150         description_el = metadata.find('description')
 151         if description_el is not None:
 152             description = description_el.text
 153         else:
 154             description = None
 155         imagePreview_el = metadata.find('imagePreview')
 156         if imagePreview_el is not None:
 157             thumbnail = imagePreview_el.text
 158         else:
 159             thumbnail = None
 160         info = {
 161             'id': video_id,
 162             'url': video_url,
 163             'title': title,
 164             'ext': extension,
 165             'format': format,
 166             'thumbnail': thumbnail,
 167             'description': description
 168         }
 169         return [info]
 170
 171 class SpiegelIE(InfoExtractor):
 172     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 173
 174     def _real_extract(self, url):
 175         m = re.match(self._VALID_URL, url)
 176         video_id = m.group('videoID')
 177
 178         webpage = self._download_webpage(url, video_id)
 179
 180         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 181             webpage, u'title')
 182
 183         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 184         xml_code = self._download_webpage(xml_url, video_id,
 185                     note=u'Downloading XML', errnote=u'Failed to download XML')
 186
 187         idoc = xml.etree.ElementTree.fromstring(xml_code)
 188         last_type = idoc[-1]
 189         filename = last_type.findall('./filename')[0].text
 190         duration = float(last_type.findall('./duration')[0].text)
 191
 192         video_url = 'http://video2.spiegel.de/flash/' + filename
 193         video_ext = filename.rpartition('.')[2]
 194         info = {
 195             'id': video_id,
 196             'url': video_url,
 197             'ext': video_ext,
 198             'title': video_title,
 199             'duration': duration,
 200         }
 201         return [info]
 202
 203 class LiveLeakIE(InfoExtractor):
 204
 205     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 206     IE_NAME = u'liveleak'
 207
 208     def _real_extract(self, url):
 209         mobj = re.match(self._VALID_URL, url)
 210         if mobj is None:
 211             raise ExtractorError(u'Invalid URL: %s' % url)
 212
 213         video_id = mobj.group('video_id')
 214
 215         webpage = self._download_webpage(url, video_id)
 216
 217         video_url = self._search_regex(r'file: "(.*?)",',
 218             webpage, u'video URL')
 219
 220         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 221             webpage, u'title').replace('LiveLeak.com -', '').strip()
 222
 223         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 224             webpage, u'description', fatal=False)
 225
 226         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 227             webpage, u'uploader', fatal=False)
 228
 229         info = {
 230             'id':  video_id,
 231             'url': video_url,
 232             'ext': 'mp4',
 233             'title': video_title,
 234             'description': video_description,
 235             'uploader': video_uploader
 236         }
 237
 238         return [info]
 239
 240
 241
 242 class TumblrIE(InfoExtractor):
 243     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 244
 245     def _real_extract(self, url):
 246         m_url = re.match(self._VALID_URL, url)
 247         video_id = m_url.group('id')
 248         blog = m_url.group('blog_name')
 249
 250         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 251         webpage = self._download_webpage(url, video_id)
 252
 253         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 254         video = re.search(re_video, webpage)
 255         if video is None:
 256            raise ExtractorError(u'Unable to extract video')
 257         video_url = video.group('video_url')
 258         ext = video.group('ext')
 259
 260         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 261             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 262         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 263
 264         # The only place where you can get a title, it's not complete,
 265         # but searching in other places doesn't work for all videos
 266         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 267             webpage, u'title', flags=re.DOTALL)
 268
 269         return [{'id': video_id,
 270                  'url': video_url,
 271                  'title': video_title,
 272                  'thumbnail': video_thumbnail,
 273                  'ext': ext
 274                  }]
 275
 276 class BandcampIE(InfoExtractor):
 277     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 278
 279     def _real_extract(self, url):
 280         mobj = re.match(self._VALID_URL, url)
 281         title = mobj.group('title')
 282         webpage = self._download_webpage(url, title)
 283         # We get the link to the free download page
 284         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 285         if m_download is None:
 286             raise ExtractorError(u'No free songs found')
 287
 288         download_link = m_download.group(1)
 289         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 290                        webpage, re.MULTILINE|re.DOTALL).group('id')
 291
 292         download_webpage = self._download_webpage(download_link, id,
 293                                                   'Downloading free downloads page')
 294         # We get the dictionary of the track from some javascrip code
 295         info = re.search(r'items: (.*?),$',
 296                          download_webpage, re.MULTILINE).group(1)
 297         info = json.loads(info)[0]
 298         # We pick mp3-320 for now, until format selection can be easily implemented.
 299         mp3_info = info[u'downloads'][u'mp3-320']
 300         # If we try to use this url it says the link has expired
 301         initial_url = mp3_info[u'url']
 302         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 303         m_url = re.match(re_url, initial_url)
 304         #We build the url we will use to get the final track url
 305         # This url is build in Bandcamp in the script download_bunde_*.js
 306         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 307         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 308         # If we could correctly generate the .rand field the url would be
 309         #in the "download_url" key
 310         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 311
 312         track_info = {'id':id,
 313                       'title' : info[u'title'],
 314                       'ext' :   'mp3',
 315                       'url' :   final_url,
 316                       'thumbnail' : info[u'thumb_url'],
 317                       'uploader' :  info[u'artist']
 318                       }
 319
 320         return [track_info]
 321
 322 class RedTubeIE(InfoExtractor):
 323     """Information Extractor for redtube"""
 324     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 325
 326     def _real_extract(self,url):
 327         mobj = re.match(self._VALID_URL, url)
 328         if mobj is None:
 329             raise ExtractorError(u'Invalid URL: %s' % url)
 330
 331         video_id = mobj.group('id')
 332         video_extension = 'mp4'
 333         webpage = self._download_webpage(url, video_id)
 334
 335         self.report_extraction(video_id)
 336
 337         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 338             webpage, u'video URL')
 339
 340         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 341             webpage, u'title')
 342
 343         return [{
 344             'id':       video_id,
 345             'url':      video_url,
 346             'ext':      video_extension,
 347             'title':    video_title,
 348         }]
 349
 350 class InaIE(InfoExtractor):
 351     """Information Extractor for Ina.fr"""
 352     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 353
 354     def _real_extract(self,url):
 355         mobj = re.match(self._VALID_URL, url)
 356
 357         video_id = mobj.group('id')
 358         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 359         video_extension = 'mp4'
 360         webpage = self._download_webpage(mrss_url, video_id)
 361
 362         self.report_extraction(video_id)
 363
 364         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 365             webpage, u'video URL')
 366
 367         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 368             webpage, u'title')
 369
 370         return [{
 371             'id':       video_id,
 372             'url':      video_url,
 373             'ext':      video_extension,
 374             'title':    video_title,
 375         }]
 376
 377 class HowcastIE(InfoExtractor):
 378     """Information Extractor for Howcast.com"""
 379     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 380
 381     def _real_extract(self, url):
 382         mobj = re.match(self._VALID_URL, url)
 383
 384         video_id = mobj.group('id')
 385         webpage_url = 'http://www.howcast.com/videos/' + video_id
 386         webpage = self._download_webpage(webpage_url, video_id)
 387
 388         self.report_extraction(video_id)
 389
 390         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 391             webpage, u'video URL')
 392
 393         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 394             webpage, u'title')
 395
 396         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 397             webpage, u'description', fatal=False)
 398
 399         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 400             webpage, u'thumbnail', fatal=False)
 401
 402         return [{
 403             'id':       video_id,
 404             'url':      video_url,
 405             'ext':      'mp4',
 406             'title':    video_title,
 407             'description': video_description,
 408             'thumbnail': thumbnail,
 409         }]
 410
 411 class VineIE(InfoExtractor):
 412     """Information Extractor for Vine.co"""
 413     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 414
 415     def _real_extract(self, url):
 416         mobj = re.match(self._VALID_URL, url)
 417
 418         video_id = mobj.group('id')
 419         webpage_url = 'https://vine.co/v/' + video_id
 420         webpage = self._download_webpage(webpage_url, video_id)
 421
 422         self.report_extraction(video_id)
 423
 424         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 425             webpage, u'video URL')
 426
 427         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 428             webpage, u'title')
 429
 430         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 431             webpage, u'thumbnail', fatal=False)
 432
 433         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 434             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 435
 436         return [{
 437             'id':        video_id,
 438             'url':       video_url,
 439             'ext':       'mp4',
 440             'title':     video_title,
 441             'thumbnail': thumbnail,
 442             'uploader':  uploader,
 443         }]
 444
 445 class FlickrIE(InfoExtractor):
 446     """Information Extractor for Flickr videos"""
 447     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 448
 449     def _real_extract(self, url):
 450         mobj = re.match(self._VALID_URL, url)
 451
 452         video_id = mobj.group('id')
 453         video_uploader_id = mobj.group('uploader_id')
 454         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 455         webpage = self._download_webpage(webpage_url, video_id)
 456
 457         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 458
 459         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 460         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 461
 462         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 463             first_xml, u'node_id')
 464
 465         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 466         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 467
 468         self.report_extraction(video_id)
 469
 470         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 471         if mobj is None:
 472             raise ExtractorError(u'Unable to extract video url')
 473         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 474
 475         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 476             webpage, u'video title')
 477
 478         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 479             webpage, u'description', fatal=False)
 480
 481         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 482             webpage, u'thumbnail', fatal=False)
 483
 484         return [{
 485             'id':          video_id,
 486             'url':         video_url,
 487             'ext':         'mp4',
 488             'title':       video_title,
 489             'description': video_description,
 490             'thumbnail':   thumbnail,
 491             'uploader_id': video_uploader_id,
 492         }]
 493
 494 class TeamcocoIE(InfoExtractor):
 495     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 496
 497     def _real_extract(self, url):
 498         mobj = re.match(self._VALID_URL, url)
 499         if mobj is None:
 500             raise ExtractorError(u'Invalid URL: %s' % url)
 501         url_title = mobj.group('url_title')
 502         webpage = self._download_webpage(url, url_title)
 503
 504         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 505             webpage, u'video id')
 506
 507         self.report_extraction(video_id)
 508
 509         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 510             webpage, u'title')
 511
 512         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 513             webpage, u'thumbnail', fatal=False)
 514
 515         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 516             webpage, u'description', fatal=False)
 517
 518         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 519         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 520
 521         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 522             data, u'video URL')
 523
 524         return [{
 525             'id':          video_id,
 526             'url':         video_url,
 527             'ext':         'mp4',
 528             'title':       video_title,
 529             'thumbnail':   thumbnail,
 530             'description': video_description,
 531         }]
 532
 533 class XHamsterIE(InfoExtractor):
 534     """Information Extractor for xHamster"""
 535     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 536
 537     def _real_extract(self,url):
 538         mobj = re.match(self._VALID_URL, url)
 539
 540         video_id = mobj.group('id')
 541         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 542         webpage = self._download_webpage(mrss_url, video_id)
 543
 544         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 545         if mobj is None:
 546             raise ExtractorError(u'Unable to extract media URL')
 547         if len(mobj.group('server')) == 0:
 548             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 549         else:
 550             video_url = mobj.group('server')+'/key='+mobj.group('file')
 551         video_extension = video_url.split('.')[-1]
 552
 553         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 554             webpage, u'title')
 555
 556         # Can't see the description anywhere in the UI
 557         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 558         #     webpage, u'description', fatal=False)
 559         # if video_description: video_description = unescapeHTML(video_description)
 560
 561         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 562         if mobj:
 563             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 564         else:
 565             video_upload_date = None
 566             self._downloader.report_warning(u'Unable to extract upload date')
 567
 568         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 569             webpage, u'uploader id', default=u'anonymous')
 570
 571         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 572             webpage, u'thumbnail', fatal=False)
 573
 574         return [{
 575             'id':       video_id,
 576             'url':      video_url,
 577             'ext':      video_extension,
 578             'title':    video_title,
 579             # 'description': video_description,
 580             'upload_date': video_upload_date,
 581             'uploader_id': video_uploader_id,
 582             'thumbnail': video_thumbnail
 583         }]
 584
 585 class HypemIE(InfoExtractor):
 586     """Information Extractor for hypem"""
 587     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 588
 589     def _real_extract(self, url):
 590         mobj = re.match(self._VALID_URL, url)
 591         if mobj is None:
 592             raise ExtractorError(u'Invalid URL: %s' % url)
 593         track_id = mobj.group(1)
 594
 595         data = { 'ax': 1, 'ts': time.time() }
 596         data_encoded = compat_urllib_parse.urlencode(data)
 597         complete_url = url + "?" + data_encoded
 598         request = compat_urllib_request.Request(complete_url)
 599         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 600         cookie = urlh.headers.get('Set-Cookie', '')
 601
 602         self.report_extraction(track_id)
 603
 604         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 605             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 606         try:
 607             track_list = json.loads(html_tracks)
 608             track = track_list[u'tracks'][0]
 609         except ValueError:
 610             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 611
 612         key = track[u"key"]
 613         track_id = track[u"id"]
 614         artist = track[u"artist"]
 615         title = track[u"song"]
 616
 617         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 618         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 619         request.add_header('cookie', cookie)
 620         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 621         try:
 622             song_data = json.loads(song_data_json)
 623         except ValueError:
 624             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 625         final_url = song_data[u"url"]
 626
 627         return [{
 628             'id':       track_id,
 629             'url':      final_url,
 630             'ext':      "mp3",
 631             'title':    title,
 632             'artist':   artist,
 633         }]
 634
 635 class Vbox7IE(InfoExtractor):
 636     """Information Extractor for Vbox7"""
 637     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 638
 639     def _real_extract(self,url):
 640         mobj = re.match(self._VALID_URL, url)
 641         if mobj is None:
 642             raise ExtractorError(u'Invalid URL: %s' % url)
 643         video_id = mobj.group(1)
 644
 645         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 646         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 647         redirect_url = urlh.geturl() + new_location
 648         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 649
 650         title = self._html_search_regex(r'<title>(.*)</title>',
 651             webpage, u'title').split('/')[0].strip()
 652
 653         ext = "flv"
 654         info_url = "http://vbox7.com/play/magare.do"
 655         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 656         info_request = compat_urllib_request.Request(info_url, data)
 657         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 658         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 659         if info_response is None:
 660             raise ExtractorError(u'Unable to extract the media url')
 661         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 662
 663         return [{
 664             'id':        video_id,
 665             'url':       final_url,
 666             'ext':       ext,
 667             'title':     title,
 668             'thumbnail': thumbnail_url,
 669         }]
 670
 671
 672 def gen_extractors():
 673     """ Return a list of an instance of every supported extractor.
 674     The order does matter; the first extractor matched is the one handling the URL.
 675     """
 676     return [
 677         YoutubePlaylistIE(),
 678         YoutubeChannelIE(),
 679         YoutubeUserIE(),
 680         YoutubeSearchIE(),
 681         YoutubeIE(),
 682         MetacafeIE(),
 683         DailymotionIE(),
 684         GoogleSearchIE(),
 685         PhotobucketIE(),
 686         YahooIE(),
 687         YahooSearchIE(),
 688         DepositFilesIE(),
 689         FacebookIE(),
 690         BlipTVIE(),
 691         BlipTVUserIE(),
 692         VimeoIE(),
 693         MyVideoIE(),
 694         ComedyCentralIE(),
 695         EscapistIE(),
 696         CollegeHumorIE(),
 697         XVideosIE(),
 698         SoundcloudSetIE(),
 699         SoundcloudIE(),
 700         InfoQIE(),
 701         MixcloudIE(),
 702         StanfordOpenClassroomIE(),
 703         MTVIE(),
 704         YoukuIE(),
 705         XNXXIE(),
 706         YouJizzIE(),
 707         PornotubeIE(),
 708         YouPornIE(),
 709         GooglePlusIE(),
 710         ArteTvIE(),
 711         NBAIE(),
 712         WorldStarHipHopIE(),
 713         JustinTVIE(),
 714         FunnyOrDieIE(),
 715         SteamIE(),
 716         UstreamIE(),
 717         RBMARadioIE(),
 718         EightTracksIE(),
 719         KeekIE(),
 720         TEDIE(),
 721         MySpassIE(),
 722         SpiegelIE(),
 723         LiveLeakIE(),
 724         ARDIE(),
 725         ZDFIE(),
 726         TumblrIE(),
 727         BandcampIE(),
 728         RedTubeIE(),
 729         InaIE(),
 730         HowcastIE(),
 731         VineIE(),
 732         FlickrIE(),
 733         TeamcocoIE(),
 734         XHamsterIE(),
 735         HypemIE(),
 736         Vbox7IE(),
 737         GametrailersIE(),
 738         StatigramIE(),
 739         GenericIE()
 740     ]
 741
 742 def get_info_extractor(ie_name):
 743     """Returns the info extractor class with the given ie_name"""
 744     return globals()[ie_name+'IE']