_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.pornotube import PornotubeIE
  45 from .extractor.rbmaradio import RBMARadioIE
  46 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  47 from .extractor.stanfordoc import StanfordOpenClassroomIE
  48 from .extractor.steam import SteamIE
  49 from .extractor.ted import TEDIE
  50 from .extractor.ustream import UstreamIE
  51 from .extractor.vimeo import VimeoIE
  52 from .extractor.worldstarhiphop import WorldStarHipHopIE
  53 from .extractor.xnxx import XNXXIE
  54 from .extractor.xvideos import XVideosIE
  55 from .extractor.yahoo import YahooIE, YahooSearchIE
  56 from .extractor.youjizz import YouJizzIE
  57 from .extractor.youku import YoukuIE
  58 from .extractor.youporn import YouPornIE
  59 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  60 from .extractor.zdf import ZDFIE
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86 class EightTracksIE(InfoExtractor):
  87     IE_NAME = '8tracks'
  88     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
  89
  90     def _real_extract(self, url):
  91         mobj = re.match(self._VALID_URL, url)
  92         if mobj is None:
  93             raise ExtractorError(u'Invalid URL: %s' % url)
  94         playlist_id = mobj.group('id')
  95
  96         webpage = self._download_webpage(url, playlist_id)
  97
  98         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
  99         data = json.loads(json_like)
 100
 101         session = str(random.randint(0, 1000000000))
 102         mix_id = data['id']
 103         track_count = data['tracks_count']
 104         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 105         next_url = first_url
 106         res = []
 107         for i in itertools.count():
 108             api_json = self._download_webpage(next_url, playlist_id,
 109                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 110                 errnote=u'Failed to download song information')
 111             api_data = json.loads(api_json)
 112             track_data = api_data[u'set']['track']
 113             info = {
 114                 'id': track_data['id'],
 115                 'url': track_data['track_file_stream_url'],
 116                 'title': track_data['performer'] + u' - ' + track_data['name'],
 117                 'raw_title': track_data['name'],
 118                 'uploader_id': data['user']['login'],
 119                 'ext': 'm4a',
 120             }
 121             res.append(info)
 122             if api_data['set']['at_last_track']:
 123                 break
 124             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 125         return res
 126
 127 class KeekIE(InfoExtractor):
 128     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 129     IE_NAME = u'keek'
 130
 131     def _real_extract(self, url):
 132         m = re.match(self._VALID_URL, url)
 133         video_id = m.group('videoID')
 134
 135         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 136         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 137         webpage = self._download_webpage(url, video_id)
 138
 139         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 140             webpage, u'title')
 141
 142         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 143             webpage, u'uploader', fatal=False)
 144
 145         info = {
 146                 'id': video_id,
 147                 'url': video_url,
 148                 'ext': 'mp4',
 149                 'title': video_title,
 150                 'thumbnail': thumbnail,
 151                 'uploader': uploader
 152         }
 153         return [info]
 154
 155
 156 class MySpassIE(InfoExtractor):
 157     _VALID_URL = r'http://www.myspass.de/.*'
 158
 159     def _real_extract(self, url):
 160         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 161
 162         # video id is the last path element of the URL
 163         # usually there is a trailing slash, so also try the second but last
 164         url_path = compat_urllib_parse_urlparse(url).path
 165         url_parent_path, video_id = os.path.split(url_path)
 166         if not video_id:
 167             _, video_id = os.path.split(url_parent_path)
 168
 169         # get metadata
 170         metadata_url = META_DATA_URL_TEMPLATE % video_id
 171         metadata_text = self._download_webpage(metadata_url, video_id)
 172         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 173
 174         # extract values from metadata
 175         url_flv_el = metadata.find('url_flv')
 176         if url_flv_el is None:
 177             raise ExtractorError(u'Unable to extract download url')
 178         video_url = url_flv_el.text
 179         extension = os.path.splitext(video_url)[1][1:]
 180         title_el = metadata.find('title')
 181         if title_el is None:
 182             raise ExtractorError(u'Unable to extract title')
 183         title = title_el.text
 184         format_id_el = metadata.find('format_id')
 185         if format_id_el is None:
 186             format = ext
 187         else:
 188             format = format_id_el.text
 189         description_el = metadata.find('description')
 190         if description_el is not None:
 191             description = description_el.text
 192         else:
 193             description = None
 194         imagePreview_el = metadata.find('imagePreview')
 195         if imagePreview_el is not None:
 196             thumbnail = imagePreview_el.text
 197         else:
 198             thumbnail = None
 199         info = {
 200             'id': video_id,
 201             'url': video_url,
 202             'title': title,
 203             'ext': extension,
 204             'format': format,
 205             'thumbnail': thumbnail,
 206             'description': description
 207         }
 208         return [info]
 209
 210 class SpiegelIE(InfoExtractor):
 211     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 212
 213     def _real_extract(self, url):
 214         m = re.match(self._VALID_URL, url)
 215         video_id = m.group('videoID')
 216
 217         webpage = self._download_webpage(url, video_id)
 218
 219         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 220             webpage, u'title')
 221
 222         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 223         xml_code = self._download_webpage(xml_url, video_id,
 224                     note=u'Downloading XML', errnote=u'Failed to download XML')
 225
 226         idoc = xml.etree.ElementTree.fromstring(xml_code)
 227         last_type = idoc[-1]
 228         filename = last_type.findall('./filename')[0].text
 229         duration = float(last_type.findall('./duration')[0].text)
 230
 231         video_url = 'http://video2.spiegel.de/flash/' + filename
 232         video_ext = filename.rpartition('.')[2]
 233         info = {
 234             'id': video_id,
 235             'url': video_url,
 236             'ext': video_ext,
 237             'title': video_title,
 238             'duration': duration,
 239         }
 240         return [info]
 241
 242 class LiveLeakIE(InfoExtractor):
 243
 244     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 245     IE_NAME = u'liveleak'
 246
 247     def _real_extract(self, url):
 248         mobj = re.match(self._VALID_URL, url)
 249         if mobj is None:
 250             raise ExtractorError(u'Invalid URL: %s' % url)
 251
 252         video_id = mobj.group('video_id')
 253
 254         webpage = self._download_webpage(url, video_id)
 255
 256         video_url = self._search_regex(r'file: "(.*?)",',
 257             webpage, u'video URL')
 258
 259         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 260             webpage, u'title').replace('LiveLeak.com -', '').strip()
 261
 262         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 263             webpage, u'description', fatal=False)
 264
 265         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 266             webpage, u'uploader', fatal=False)
 267
 268         info = {
 269             'id':  video_id,
 270             'url': video_url,
 271             'ext': 'mp4',
 272             'title': video_title,
 273             'description': video_description,
 274             'uploader': video_uploader
 275         }
 276
 277         return [info]
 278
 279
 280
 281 class TumblrIE(InfoExtractor):
 282     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 283
 284     def _real_extract(self, url):
 285         m_url = re.match(self._VALID_URL, url)
 286         video_id = m_url.group('id')
 287         blog = m_url.group('blog_name')
 288
 289         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 290         webpage = self._download_webpage(url, video_id)
 291
 292         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 293         video = re.search(re_video, webpage)
 294         if video is None:
 295            raise ExtractorError(u'Unable to extract video')
 296         video_url = video.group('video_url')
 297         ext = video.group('ext')
 298
 299         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 300             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 301         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 302
 303         # The only place where you can get a title, it's not complete,
 304         # but searching in other places doesn't work for all videos
 305         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 306             webpage, u'title', flags=re.DOTALL)
 307
 308         return [{'id': video_id,
 309                  'url': video_url,
 310                  'title': video_title,
 311                  'thumbnail': video_thumbnail,
 312                  'ext': ext
 313                  }]
 314
 315 class BandcampIE(InfoExtractor):
 316     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 317
 318     def _real_extract(self, url):
 319         mobj = re.match(self._VALID_URL, url)
 320         title = mobj.group('title')
 321         webpage = self._download_webpage(url, title)
 322         # We get the link to the free download page
 323         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 324         if m_download is None:
 325             raise ExtractorError(u'No free songs found')
 326
 327         download_link = m_download.group(1)
 328         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 329                        webpage, re.MULTILINE|re.DOTALL).group('id')
 330
 331         download_webpage = self._download_webpage(download_link, id,
 332                                                   'Downloading free downloads page')
 333         # We get the dictionary of the track from some javascrip code
 334         info = re.search(r'items: (.*?),$',
 335                          download_webpage, re.MULTILINE).group(1)
 336         info = json.loads(info)[0]
 337         # We pick mp3-320 for now, until format selection can be easily implemented.
 338         mp3_info = info[u'downloads'][u'mp3-320']
 339         # If we try to use this url it says the link has expired
 340         initial_url = mp3_info[u'url']
 341         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 342         m_url = re.match(re_url, initial_url)
 343         #We build the url we will use to get the final track url
 344         # This url is build in Bandcamp in the script download_bunde_*.js
 345         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 346         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 347         # If we could correctly generate the .rand field the url would be
 348         #in the "download_url" key
 349         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 350
 351         track_info = {'id':id,
 352                       'title' : info[u'title'],
 353                       'ext' :   'mp3',
 354                       'url' :   final_url,
 355                       'thumbnail' : info[u'thumb_url'],
 356                       'uploader' :  info[u'artist']
 357                       }
 358
 359         return [track_info]
 360
 361 class RedTubeIE(InfoExtractor):
 362     """Information Extractor for redtube"""
 363     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 364
 365     def _real_extract(self,url):
 366         mobj = re.match(self._VALID_URL, url)
 367         if mobj is None:
 368             raise ExtractorError(u'Invalid URL: %s' % url)
 369
 370         video_id = mobj.group('id')
 371         video_extension = 'mp4'
 372         webpage = self._download_webpage(url, video_id)
 373
 374         self.report_extraction(video_id)
 375
 376         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 377             webpage, u'video URL')
 378
 379         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 380             webpage, u'title')
 381
 382         return [{
 383             'id':       video_id,
 384             'url':      video_url,
 385             'ext':      video_extension,
 386             'title':    video_title,
 387         }]
 388
 389 class InaIE(InfoExtractor):
 390     """Information Extractor for Ina.fr"""
 391     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 392
 393     def _real_extract(self,url):
 394         mobj = re.match(self._VALID_URL, url)
 395
 396         video_id = mobj.group('id')
 397         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 398         video_extension = 'mp4'
 399         webpage = self._download_webpage(mrss_url, video_id)
 400
 401         self.report_extraction(video_id)
 402
 403         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 404             webpage, u'video URL')
 405
 406         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 407             webpage, u'title')
 408
 409         return [{
 410             'id':       video_id,
 411             'url':      video_url,
 412             'ext':      video_extension,
 413             'title':    video_title,
 414         }]
 415
 416 class HowcastIE(InfoExtractor):
 417     """Information Extractor for Howcast.com"""
 418     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 419
 420     def _real_extract(self, url):
 421         mobj = re.match(self._VALID_URL, url)
 422
 423         video_id = mobj.group('id')
 424         webpage_url = 'http://www.howcast.com/videos/' + video_id
 425         webpage = self._download_webpage(webpage_url, video_id)
 426
 427         self.report_extraction(video_id)
 428
 429         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 430             webpage, u'video URL')
 431
 432         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 433             webpage, u'title')
 434
 435         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 436             webpage, u'description', fatal=False)
 437
 438         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 439             webpage, u'thumbnail', fatal=False)
 440
 441         return [{
 442             'id':       video_id,
 443             'url':      video_url,
 444             'ext':      'mp4',
 445             'title':    video_title,
 446             'description': video_description,
 447             'thumbnail': thumbnail,
 448         }]
 449
 450 class VineIE(InfoExtractor):
 451     """Information Extractor for Vine.co"""
 452     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 453
 454     def _real_extract(self, url):
 455         mobj = re.match(self._VALID_URL, url)
 456
 457         video_id = mobj.group('id')
 458         webpage_url = 'https://vine.co/v/' + video_id
 459         webpage = self._download_webpage(webpage_url, video_id)
 460
 461         self.report_extraction(video_id)
 462
 463         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 464             webpage, u'video URL')
 465
 466         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 467             webpage, u'title')
 468
 469         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 470             webpage, u'thumbnail', fatal=False)
 471
 472         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 473             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 474
 475         return [{
 476             'id':        video_id,
 477             'url':       video_url,
 478             'ext':       'mp4',
 479             'title':     video_title,
 480             'thumbnail': thumbnail,
 481             'uploader':  uploader,
 482         }]
 483
 484 class FlickrIE(InfoExtractor):
 485     """Information Extractor for Flickr videos"""
 486     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 487
 488     def _real_extract(self, url):
 489         mobj = re.match(self._VALID_URL, url)
 490
 491         video_id = mobj.group('id')
 492         video_uploader_id = mobj.group('uploader_id')
 493         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 494         webpage = self._download_webpage(webpage_url, video_id)
 495
 496         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 497
 498         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 499         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 500
 501         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 502             first_xml, u'node_id')
 503
 504         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 505         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 506
 507         self.report_extraction(video_id)
 508
 509         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 510         if mobj is None:
 511             raise ExtractorError(u'Unable to extract video url')
 512         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 513
 514         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 515             webpage, u'video title')
 516
 517         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 518             webpage, u'description', fatal=False)
 519
 520         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 521             webpage, u'thumbnail', fatal=False)
 522
 523         return [{
 524             'id':          video_id,
 525             'url':         video_url,
 526             'ext':         'mp4',
 527             'title':       video_title,
 528             'description': video_description,
 529             'thumbnail':   thumbnail,
 530             'uploader_id': video_uploader_id,
 531         }]
 532
 533 class TeamcocoIE(InfoExtractor):
 534     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 535
 536     def _real_extract(self, url):
 537         mobj = re.match(self._VALID_URL, url)
 538         if mobj is None:
 539             raise ExtractorError(u'Invalid URL: %s' % url)
 540         url_title = mobj.group('url_title')
 541         webpage = self._download_webpage(url, url_title)
 542
 543         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 544             webpage, u'video id')
 545
 546         self.report_extraction(video_id)
 547
 548         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 549             webpage, u'title')
 550
 551         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 552             webpage, u'thumbnail', fatal=False)
 553
 554         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 555             webpage, u'description', fatal=False)
 556
 557         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 558         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 559
 560         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 561             data, u'video URL')
 562
 563         return [{
 564             'id':          video_id,
 565             'url':         video_url,
 566             'ext':         'mp4',
 567             'title':       video_title,
 568             'thumbnail':   thumbnail,
 569             'description': video_description,
 570         }]
 571
 572 class XHamsterIE(InfoExtractor):
 573     """Information Extractor for xHamster"""
 574     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 575
 576     def _real_extract(self,url):
 577         mobj = re.match(self._VALID_URL, url)
 578
 579         video_id = mobj.group('id')
 580         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 581         webpage = self._download_webpage(mrss_url, video_id)
 582
 583         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 584         if mobj is None:
 585             raise ExtractorError(u'Unable to extract media URL')
 586         if len(mobj.group('server')) == 0:
 587             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 588         else:
 589             video_url = mobj.group('server')+'/key='+mobj.group('file')
 590         video_extension = video_url.split('.')[-1]
 591
 592         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 593             webpage, u'title')
 594
 595         # Can't see the description anywhere in the UI
 596         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 597         #     webpage, u'description', fatal=False)
 598         # if video_description: video_description = unescapeHTML(video_description)
 599
 600         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 601         if mobj:
 602             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 603         else:
 604             video_upload_date = None
 605             self._downloader.report_warning(u'Unable to extract upload date')
 606
 607         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 608             webpage, u'uploader id', default=u'anonymous')
 609
 610         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 611             webpage, u'thumbnail', fatal=False)
 612
 613         return [{
 614             'id':       video_id,
 615             'url':      video_url,
 616             'ext':      video_extension,
 617             'title':    video_title,
 618             # 'description': video_description,
 619             'upload_date': video_upload_date,
 620             'uploader_id': video_uploader_id,
 621             'thumbnail': video_thumbnail
 622         }]
 623
 624 class HypemIE(InfoExtractor):
 625     """Information Extractor for hypem"""
 626     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 627
 628     def _real_extract(self, url):
 629         mobj = re.match(self._VALID_URL, url)
 630         if mobj is None:
 631             raise ExtractorError(u'Invalid URL: %s' % url)
 632         track_id = mobj.group(1)
 633
 634         data = { 'ax': 1, 'ts': time.time() }
 635         data_encoded = compat_urllib_parse.urlencode(data)
 636         complete_url = url + "?" + data_encoded
 637         request = compat_urllib_request.Request(complete_url)
 638         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 639         cookie = urlh.headers.get('Set-Cookie', '')
 640
 641         self.report_extraction(track_id)
 642
 643         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 644             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 645         try:
 646             track_list = json.loads(html_tracks)
 647             track = track_list[u'tracks'][0]
 648         except ValueError:
 649             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 650
 651         key = track[u"key"]
 652         track_id = track[u"id"]
 653         artist = track[u"artist"]
 654         title = track[u"song"]
 655
 656         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 657         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 658         request.add_header('cookie', cookie)
 659         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 660         try:
 661             song_data = json.loads(song_data_json)
 662         except ValueError:
 663             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 664         final_url = song_data[u"url"]
 665
 666         return [{
 667             'id':       track_id,
 668             'url':      final_url,
 669             'ext':      "mp3",
 670             'title':    title,
 671             'artist':   artist,
 672         }]
 673
 674 class Vbox7IE(InfoExtractor):
 675     """Information Extractor for Vbox7"""
 676     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 677
 678     def _real_extract(self,url):
 679         mobj = re.match(self._VALID_URL, url)
 680         if mobj is None:
 681             raise ExtractorError(u'Invalid URL: %s' % url)
 682         video_id = mobj.group(1)
 683
 684         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 685         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 686         redirect_url = urlh.geturl() + new_location
 687         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 688
 689         title = self._html_search_regex(r'<title>(.*)</title>',
 690             webpage, u'title').split('/')[0].strip()
 691
 692         ext = "flv"
 693         info_url = "http://vbox7.com/play/magare.do"
 694         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 695         info_request = compat_urllib_request.Request(info_url, data)
 696         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 697         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 698         if info_response is None:
 699             raise ExtractorError(u'Unable to extract the media url')
 700         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 701
 702         return [{
 703             'id':        video_id,
 704             'url':       final_url,
 705             'ext':       ext,
 706             'title':     title,
 707             'thumbnail': thumbnail_url,
 708         }]
 709
 710
 711 def gen_extractors():
 712     """ Return a list of an instance of every supported extractor.
 713     The order does matter; the first extractor matched is the one handling the URL.
 714     """
 715     return [
 716         YoutubePlaylistIE(),
 717         YoutubeChannelIE(),
 718         YoutubeUserIE(),
 719         YoutubeSearchIE(),
 720         YoutubeIE(),
 721         MetacafeIE(),
 722         DailymotionIE(),
 723         GoogleSearchIE(),
 724         PhotobucketIE(),
 725         YahooIE(),
 726         YahooSearchIE(),
 727         DepositFilesIE(),
 728         FacebookIE(),
 729         BlipTVIE(),
 730         BlipTVUserIE(),
 731         VimeoIE(),
 732         MyVideoIE(),
 733         ComedyCentralIE(),
 734         EscapistIE(),
 735         CollegeHumorIE(),
 736         XVideosIE(),
 737         SoundcloudSetIE(),
 738         SoundcloudIE(),
 739         InfoQIE(),
 740         MixcloudIE(),
 741         StanfordOpenClassroomIE(),
 742         MTVIE(),
 743         YoukuIE(),
 744         XNXXIE(),
 745         YouJizzIE(),
 746         PornotubeIE(),
 747         YouPornIE(),
 748         GooglePlusIE(),
 749         ArteTvIE(),
 750         NBAIE(),
 751         WorldStarHipHopIE(),
 752         JustinTVIE(),
 753         FunnyOrDieIE(),
 754         SteamIE(),
 755         UstreamIE(),
 756         RBMARadioIE(),
 757         EightTracksIE(),
 758         KeekIE(),
 759         TEDIE(),
 760         MySpassIE(),
 761         SpiegelIE(),
 762         LiveLeakIE(),
 763         ARDIE(),
 764         ZDFIE(),
 765         TumblrIE(),
 766         BandcampIE(),
 767         RedTubeIE(),
 768         InaIE(),
 769         HowcastIE(),
 770         VineIE(),
 771         FlickrIE(),
 772         TeamcocoIE(),
 773         XHamsterIE(),
 774         HypemIE(),
 775         Vbox7IE(),
 776         GametrailersIE(),
 777         StatigramIE(),
 778         GenericIE()
 779     ]
 780
 781 def get_info_extractor(ie_name):
 782     """Returns the info extractor class with the given ie_name"""
 783     return globals()[ie_name+'IE']