_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.rbmaradio import RBMARadioIE
  45 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  46 from .extractor.stanfordoc import StanfordOpenClassroomIE
  47 from .extractor.steam import SteamIE
  48 from .extractor.ted import TEDIE
  49 from .extractor.ustream import UstreamIE
  50 from .extractor.vimeo import VimeoIE
  51 from .extractor.worldstarhiphop import WorldStarHipHopIE
  52 from .extractor.xnxx import XNXXIE
  53 from .extractor.xvideos import XVideosIE
  54 from .extractor.yahoo import YahooIE, YahooSearchIE
  55 from .extractor.youku import YoukuIE
  56 from .extractor.youporn import YouPornIE
  57 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  58 from .extractor.zdf import ZDFIE
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82 class PornotubeIE(InfoExtractor):
  83     """Information extractor for pornotube.com."""
  84     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
  85
  86     def _real_extract(self, url):
  87         mobj = re.match(self._VALID_URL, url)
  88         if mobj is None:
  89             raise ExtractorError(u'Invalid URL: %s' % url)
  90
  91         video_id = mobj.group('videoid')
  92         video_title = mobj.group('title')
  93
  94         # Get webpage content
  95         webpage = self._download_webpage(url, video_id)
  96
  97         # Get the video URL
  98         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
  99         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 100         video_url = compat_urllib_parse.unquote(video_url)
 101
 102         #Get the uploaded date
 103         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 104         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 105         if upload_date: upload_date = unified_strdate(upload_date)
 106
 107         info = {'id': video_id,
 108                 'url': video_url,
 109                 'uploader': None,
 110                 'upload_date': upload_date,
 111                 'title': video_title,
 112                 'ext': 'flv',
 113                 'format': 'flv'}
 114
 115         return [info]
 116
 117 class YouJizzIE(InfoExtractor):
 118     """Information extractor for youjizz.com."""
 119     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 120
 121     def _real_extract(self, url):
 122         mobj = re.match(self._VALID_URL, url)
 123         if mobj is None:
 124             raise ExtractorError(u'Invalid URL: %s' % url)
 125
 126         video_id = mobj.group('videoid')
 127
 128         # Get webpage content
 129         webpage = self._download_webpage(url, video_id)
 130
 131         # Get the video title
 132         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 133             webpage, u'title').strip()
 134
 135         # Get the embed page
 136         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 137         if result is None:
 138             raise ExtractorError(u'ERROR: unable to extract embed page')
 139
 140         embed_page_url = result.group(0).strip()
 141         video_id = result.group('videoid')
 142
 143         webpage = self._download_webpage(embed_page_url, video_id)
 144
 145         # Get the video URL
 146         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 147             webpage, u'video URL')
 148
 149         info = {'id': video_id,
 150                 'url': video_url,
 151                 'title': video_title,
 152                 'ext': 'flv',
 153                 'format': 'flv',
 154                 'player_url': embed_page_url}
 155
 156         return [info]
 157
 158 class EightTracksIE(InfoExtractor):
 159     IE_NAME = '8tracks'
 160     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 161
 162     def _real_extract(self, url):
 163         mobj = re.match(self._VALID_URL, url)
 164         if mobj is None:
 165             raise ExtractorError(u'Invalid URL: %s' % url)
 166         playlist_id = mobj.group('id')
 167
 168         webpage = self._download_webpage(url, playlist_id)
 169
 170         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 171         data = json.loads(json_like)
 172
 173         session = str(random.randint(0, 1000000000))
 174         mix_id = data['id']
 175         track_count = data['tracks_count']
 176         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 177         next_url = first_url
 178         res = []
 179         for i in itertools.count():
 180             api_json = self._download_webpage(next_url, playlist_id,
 181                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 182                 errnote=u'Failed to download song information')
 183             api_data = json.loads(api_json)
 184             track_data = api_data[u'set']['track']
 185             info = {
 186                 'id': track_data['id'],
 187                 'url': track_data['track_file_stream_url'],
 188                 'title': track_data['performer'] + u' - ' + track_data['name'],
 189                 'raw_title': track_data['name'],
 190                 'uploader_id': data['user']['login'],
 191                 'ext': 'm4a',
 192             }
 193             res.append(info)
 194             if api_data['set']['at_last_track']:
 195                 break
 196             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 197         return res
 198
 199 class KeekIE(InfoExtractor):
 200     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 201     IE_NAME = u'keek'
 202
 203     def _real_extract(self, url):
 204         m = re.match(self._VALID_URL, url)
 205         video_id = m.group('videoID')
 206
 207         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 208         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 209         webpage = self._download_webpage(url, video_id)
 210
 211         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 212             webpage, u'title')
 213
 214         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 215             webpage, u'uploader', fatal=False)
 216
 217         info = {
 218                 'id': video_id,
 219                 'url': video_url,
 220                 'ext': 'mp4',
 221                 'title': video_title,
 222                 'thumbnail': thumbnail,
 223                 'uploader': uploader
 224         }
 225         return [info]
 226
 227
 228 class MySpassIE(InfoExtractor):
 229     _VALID_URL = r'http://www.myspass.de/.*'
 230
 231     def _real_extract(self, url):
 232         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 233
 234         # video id is the last path element of the URL
 235         # usually there is a trailing slash, so also try the second but last
 236         url_path = compat_urllib_parse_urlparse(url).path
 237         url_parent_path, video_id = os.path.split(url_path)
 238         if not video_id:
 239             _, video_id = os.path.split(url_parent_path)
 240
 241         # get metadata
 242         metadata_url = META_DATA_URL_TEMPLATE % video_id
 243         metadata_text = self._download_webpage(metadata_url, video_id)
 244         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 245
 246         # extract values from metadata
 247         url_flv_el = metadata.find('url_flv')
 248         if url_flv_el is None:
 249             raise ExtractorError(u'Unable to extract download url')
 250         video_url = url_flv_el.text
 251         extension = os.path.splitext(video_url)[1][1:]
 252         title_el = metadata.find('title')
 253         if title_el is None:
 254             raise ExtractorError(u'Unable to extract title')
 255         title = title_el.text
 256         format_id_el = metadata.find('format_id')
 257         if format_id_el is None:
 258             format = ext
 259         else:
 260             format = format_id_el.text
 261         description_el = metadata.find('description')
 262         if description_el is not None:
 263             description = description_el.text
 264         else:
 265             description = None
 266         imagePreview_el = metadata.find('imagePreview')
 267         if imagePreview_el is not None:
 268             thumbnail = imagePreview_el.text
 269         else:
 270             thumbnail = None
 271         info = {
 272             'id': video_id,
 273             'url': video_url,
 274             'title': title,
 275             'ext': extension,
 276             'format': format,
 277             'thumbnail': thumbnail,
 278             'description': description
 279         }
 280         return [info]
 281
 282 class SpiegelIE(InfoExtractor):
 283     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 284
 285     def _real_extract(self, url):
 286         m = re.match(self._VALID_URL, url)
 287         video_id = m.group('videoID')
 288
 289         webpage = self._download_webpage(url, video_id)
 290
 291         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 292             webpage, u'title')
 293
 294         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 295         xml_code = self._download_webpage(xml_url, video_id,
 296                     note=u'Downloading XML', errnote=u'Failed to download XML')
 297
 298         idoc = xml.etree.ElementTree.fromstring(xml_code)
 299         last_type = idoc[-1]
 300         filename = last_type.findall('./filename')[0].text
 301         duration = float(last_type.findall('./duration')[0].text)
 302
 303         video_url = 'http://video2.spiegel.de/flash/' + filename
 304         video_ext = filename.rpartition('.')[2]
 305         info = {
 306             'id': video_id,
 307             'url': video_url,
 308             'ext': video_ext,
 309             'title': video_title,
 310             'duration': duration,
 311         }
 312         return [info]
 313
 314 class LiveLeakIE(InfoExtractor):
 315
 316     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 317     IE_NAME = u'liveleak'
 318
 319     def _real_extract(self, url):
 320         mobj = re.match(self._VALID_URL, url)
 321         if mobj is None:
 322             raise ExtractorError(u'Invalid URL: %s' % url)
 323
 324         video_id = mobj.group('video_id')
 325
 326         webpage = self._download_webpage(url, video_id)
 327
 328         video_url = self._search_regex(r'file: "(.*?)",',
 329             webpage, u'video URL')
 330
 331         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 332             webpage, u'title').replace('LiveLeak.com -', '').strip()
 333
 334         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 335             webpage, u'description', fatal=False)
 336
 337         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 338             webpage, u'uploader', fatal=False)
 339
 340         info = {
 341             'id':  video_id,
 342             'url': video_url,
 343             'ext': 'mp4',
 344             'title': video_title,
 345             'description': video_description,
 346             'uploader': video_uploader
 347         }
 348
 349         return [info]
 350
 351
 352
 353 class TumblrIE(InfoExtractor):
 354     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 355
 356     def _real_extract(self, url):
 357         m_url = re.match(self._VALID_URL, url)
 358         video_id = m_url.group('id')
 359         blog = m_url.group('blog_name')
 360
 361         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 362         webpage = self._download_webpage(url, video_id)
 363
 364         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 365         video = re.search(re_video, webpage)
 366         if video is None:
 367            raise ExtractorError(u'Unable to extract video')
 368         video_url = video.group('video_url')
 369         ext = video.group('ext')
 370
 371         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 372             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 373         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 374
 375         # The only place where you can get a title, it's not complete,
 376         # but searching in other places doesn't work for all videos
 377         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 378             webpage, u'title', flags=re.DOTALL)
 379
 380         return [{'id': video_id,
 381                  'url': video_url,
 382                  'title': video_title,
 383                  'thumbnail': video_thumbnail,
 384                  'ext': ext
 385                  }]
 386
 387 class BandcampIE(InfoExtractor):
 388     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 389
 390     def _real_extract(self, url):
 391         mobj = re.match(self._VALID_URL, url)
 392         title = mobj.group('title')
 393         webpage = self._download_webpage(url, title)
 394         # We get the link to the free download page
 395         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 396         if m_download is None:
 397             raise ExtractorError(u'No free songs found')
 398
 399         download_link = m_download.group(1)
 400         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 401                        webpage, re.MULTILINE|re.DOTALL).group('id')
 402
 403         download_webpage = self._download_webpage(download_link, id,
 404                                                   'Downloading free downloads page')
 405         # We get the dictionary of the track from some javascrip code
 406         info = re.search(r'items: (.*?),$',
 407                          download_webpage, re.MULTILINE).group(1)
 408         info = json.loads(info)[0]
 409         # We pick mp3-320 for now, until format selection can be easily implemented.
 410         mp3_info = info[u'downloads'][u'mp3-320']
 411         # If we try to use this url it says the link has expired
 412         initial_url = mp3_info[u'url']
 413         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 414         m_url = re.match(re_url, initial_url)
 415         #We build the url we will use to get the final track url
 416         # This url is build in Bandcamp in the script download_bunde_*.js
 417         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 418         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 419         # If we could correctly generate the .rand field the url would be
 420         #in the "download_url" key
 421         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 422
 423         track_info = {'id':id,
 424                       'title' : info[u'title'],
 425                       'ext' :   'mp3',
 426                       'url' :   final_url,
 427                       'thumbnail' : info[u'thumb_url'],
 428                       'uploader' :  info[u'artist']
 429                       }
 430
 431         return [track_info]
 432
 433 class RedTubeIE(InfoExtractor):
 434     """Information Extractor for redtube"""
 435     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 436
 437     def _real_extract(self,url):
 438         mobj = re.match(self._VALID_URL, url)
 439         if mobj is None:
 440             raise ExtractorError(u'Invalid URL: %s' % url)
 441
 442         video_id = mobj.group('id')
 443         video_extension = 'mp4'
 444         webpage = self._download_webpage(url, video_id)
 445
 446         self.report_extraction(video_id)
 447
 448         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 449             webpage, u'video URL')
 450
 451         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 452             webpage, u'title')
 453
 454         return [{
 455             'id':       video_id,
 456             'url':      video_url,
 457             'ext':      video_extension,
 458             'title':    video_title,
 459         }]
 460
 461 class InaIE(InfoExtractor):
 462     """Information Extractor for Ina.fr"""
 463     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 464
 465     def _real_extract(self,url):
 466         mobj = re.match(self._VALID_URL, url)
 467
 468         video_id = mobj.group('id')
 469         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 470         video_extension = 'mp4'
 471         webpage = self._download_webpage(mrss_url, video_id)
 472
 473         self.report_extraction(video_id)
 474
 475         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 476             webpage, u'video URL')
 477
 478         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 479             webpage, u'title')
 480
 481         return [{
 482             'id':       video_id,
 483             'url':      video_url,
 484             'ext':      video_extension,
 485             'title':    video_title,
 486         }]
 487
 488 class HowcastIE(InfoExtractor):
 489     """Information Extractor for Howcast.com"""
 490     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 491
 492     def _real_extract(self, url):
 493         mobj = re.match(self._VALID_URL, url)
 494
 495         video_id = mobj.group('id')
 496         webpage_url = 'http://www.howcast.com/videos/' + video_id
 497         webpage = self._download_webpage(webpage_url, video_id)
 498
 499         self.report_extraction(video_id)
 500
 501         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 502             webpage, u'video URL')
 503
 504         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 505             webpage, u'title')
 506
 507         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 508             webpage, u'description', fatal=False)
 509
 510         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 511             webpage, u'thumbnail', fatal=False)
 512
 513         return [{
 514             'id':       video_id,
 515             'url':      video_url,
 516             'ext':      'mp4',
 517             'title':    video_title,
 518             'description': video_description,
 519             'thumbnail': thumbnail,
 520         }]
 521
 522 class VineIE(InfoExtractor):
 523     """Information Extractor for Vine.co"""
 524     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 525
 526     def _real_extract(self, url):
 527         mobj = re.match(self._VALID_URL, url)
 528
 529         video_id = mobj.group('id')
 530         webpage_url = 'https://vine.co/v/' + video_id
 531         webpage = self._download_webpage(webpage_url, video_id)
 532
 533         self.report_extraction(video_id)
 534
 535         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 536             webpage, u'video URL')
 537
 538         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 539             webpage, u'title')
 540
 541         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 542             webpage, u'thumbnail', fatal=False)
 543
 544         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 545             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 546
 547         return [{
 548             'id':        video_id,
 549             'url':       video_url,
 550             'ext':       'mp4',
 551             'title':     video_title,
 552             'thumbnail': thumbnail,
 553             'uploader':  uploader,
 554         }]
 555
 556 class FlickrIE(InfoExtractor):
 557     """Information Extractor for Flickr videos"""
 558     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 559
 560     def _real_extract(self, url):
 561         mobj = re.match(self._VALID_URL, url)
 562
 563         video_id = mobj.group('id')
 564         video_uploader_id = mobj.group('uploader_id')
 565         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 566         webpage = self._download_webpage(webpage_url, video_id)
 567
 568         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 569
 570         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 571         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 572
 573         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 574             first_xml, u'node_id')
 575
 576         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 577         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 578
 579         self.report_extraction(video_id)
 580
 581         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 582         if mobj is None:
 583             raise ExtractorError(u'Unable to extract video url')
 584         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 585
 586         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 587             webpage, u'video title')
 588
 589         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 590             webpage, u'description', fatal=False)
 591
 592         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 593             webpage, u'thumbnail', fatal=False)
 594
 595         return [{
 596             'id':          video_id,
 597             'url':         video_url,
 598             'ext':         'mp4',
 599             'title':       video_title,
 600             'description': video_description,
 601             'thumbnail':   thumbnail,
 602             'uploader_id': video_uploader_id,
 603         }]
 604
 605 class TeamcocoIE(InfoExtractor):
 606     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 607
 608     def _real_extract(self, url):
 609         mobj = re.match(self._VALID_URL, url)
 610         if mobj is None:
 611             raise ExtractorError(u'Invalid URL: %s' % url)
 612         url_title = mobj.group('url_title')
 613         webpage = self._download_webpage(url, url_title)
 614
 615         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 616             webpage, u'video id')
 617
 618         self.report_extraction(video_id)
 619
 620         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 621             webpage, u'title')
 622
 623         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 624             webpage, u'thumbnail', fatal=False)
 625
 626         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 627             webpage, u'description', fatal=False)
 628
 629         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 630         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 631
 632         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 633             data, u'video URL')
 634
 635         return [{
 636             'id':          video_id,
 637             'url':         video_url,
 638             'ext':         'mp4',
 639             'title':       video_title,
 640             'thumbnail':   thumbnail,
 641             'description': video_description,
 642         }]
 643
 644 class XHamsterIE(InfoExtractor):
 645     """Information Extractor for xHamster"""
 646     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 647
 648     def _real_extract(self,url):
 649         mobj = re.match(self._VALID_URL, url)
 650
 651         video_id = mobj.group('id')
 652         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 653         webpage = self._download_webpage(mrss_url, video_id)
 654
 655         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 656         if mobj is None:
 657             raise ExtractorError(u'Unable to extract media URL')
 658         if len(mobj.group('server')) == 0:
 659             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 660         else:
 661             video_url = mobj.group('server')+'/key='+mobj.group('file')
 662         video_extension = video_url.split('.')[-1]
 663
 664         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 665             webpage, u'title')
 666
 667         # Can't see the description anywhere in the UI
 668         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 669         #     webpage, u'description', fatal=False)
 670         # if video_description: video_description = unescapeHTML(video_description)
 671
 672         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 673         if mobj:
 674             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 675         else:
 676             video_upload_date = None
 677             self._downloader.report_warning(u'Unable to extract upload date')
 678
 679         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 680             webpage, u'uploader id', default=u'anonymous')
 681
 682         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 683             webpage, u'thumbnail', fatal=False)
 684
 685         return [{
 686             'id':       video_id,
 687             'url':      video_url,
 688             'ext':      video_extension,
 689             'title':    video_title,
 690             # 'description': video_description,
 691             'upload_date': video_upload_date,
 692             'uploader_id': video_uploader_id,
 693             'thumbnail': video_thumbnail
 694         }]
 695
 696 class HypemIE(InfoExtractor):
 697     """Information Extractor for hypem"""
 698     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 699
 700     def _real_extract(self, url):
 701         mobj = re.match(self._VALID_URL, url)
 702         if mobj is None:
 703             raise ExtractorError(u'Invalid URL: %s' % url)
 704         track_id = mobj.group(1)
 705
 706         data = { 'ax': 1, 'ts': time.time() }
 707         data_encoded = compat_urllib_parse.urlencode(data)
 708         complete_url = url + "?" + data_encoded
 709         request = compat_urllib_request.Request(complete_url)
 710         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 711         cookie = urlh.headers.get('Set-Cookie', '')
 712
 713         self.report_extraction(track_id)
 714
 715         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 716             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 717         try:
 718             track_list = json.loads(html_tracks)
 719             track = track_list[u'tracks'][0]
 720         except ValueError:
 721             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 722
 723         key = track[u"key"]
 724         track_id = track[u"id"]
 725         artist = track[u"artist"]
 726         title = track[u"song"]
 727
 728         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 729         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 730         request.add_header('cookie', cookie)
 731         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 732         try:
 733             song_data = json.loads(song_data_json)
 734         except ValueError:
 735             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 736         final_url = song_data[u"url"]
 737
 738         return [{
 739             'id':       track_id,
 740             'url':      final_url,
 741             'ext':      "mp3",
 742             'title':    title,
 743             'artist':   artist,
 744         }]
 745
 746 class Vbox7IE(InfoExtractor):
 747     """Information Extractor for Vbox7"""
 748     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 749
 750     def _real_extract(self,url):
 751         mobj = re.match(self._VALID_URL, url)
 752         if mobj is None:
 753             raise ExtractorError(u'Invalid URL: %s' % url)
 754         video_id = mobj.group(1)
 755
 756         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 757         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 758         redirect_url = urlh.geturl() + new_location
 759         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 760
 761         title = self._html_search_regex(r'<title>(.*)</title>',
 762             webpage, u'title').split('/')[0].strip()
 763
 764         ext = "flv"
 765         info_url = "http://vbox7.com/play/magare.do"
 766         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 767         info_request = compat_urllib_request.Request(info_url, data)
 768         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 769         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 770         if info_response is None:
 771             raise ExtractorError(u'Unable to extract the media url')
 772         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 773
 774         return [{
 775             'id':        video_id,
 776             'url':       final_url,
 777             'ext':       ext,
 778             'title':     title,
 779             'thumbnail': thumbnail_url,
 780         }]
 781
 782
 783 def gen_extractors():
 784     """ Return a list of an instance of every supported extractor.
 785     The order does matter; the first extractor matched is the one handling the URL.
 786     """
 787     return [
 788         YoutubePlaylistIE(),
 789         YoutubeChannelIE(),
 790         YoutubeUserIE(),
 791         YoutubeSearchIE(),
 792         YoutubeIE(),
 793         MetacafeIE(),
 794         DailymotionIE(),
 795         GoogleSearchIE(),
 796         PhotobucketIE(),
 797         YahooIE(),
 798         YahooSearchIE(),
 799         DepositFilesIE(),
 800         FacebookIE(),
 801         BlipTVIE(),
 802         BlipTVUserIE(),
 803         VimeoIE(),
 804         MyVideoIE(),
 805         ComedyCentralIE(),
 806         EscapistIE(),
 807         CollegeHumorIE(),
 808         XVideosIE(),
 809         SoundcloudSetIE(),
 810         SoundcloudIE(),
 811         InfoQIE(),
 812         MixcloudIE(),
 813         StanfordOpenClassroomIE(),
 814         MTVIE(),
 815         YoukuIE(),
 816         XNXXIE(),
 817         YouJizzIE(),
 818         PornotubeIE(),
 819         YouPornIE(),
 820         GooglePlusIE(),
 821         ArteTvIE(),
 822         NBAIE(),
 823         WorldStarHipHopIE(),
 824         JustinTVIE(),
 825         FunnyOrDieIE(),
 826         SteamIE(),
 827         UstreamIE(),
 828         RBMARadioIE(),
 829         EightTracksIE(),
 830         KeekIE(),
 831         TEDIE(),
 832         MySpassIE(),
 833         SpiegelIE(),
 834         LiveLeakIE(),
 835         ARDIE(),
 836         ZDFIE(),
 837         TumblrIE(),
 838         BandcampIE(),
 839         RedTubeIE(),
 840         InaIE(),
 841         HowcastIE(),
 842         VineIE(),
 843         FlickrIE(),
 844         TeamcocoIE(),
 845         XHamsterIE(),
 846         HypemIE(),
 847         Vbox7IE(),
 848         GametrailersIE(),
 849         StatigramIE(),
 850         GenericIE()
 851     ]
 852
 853 def get_info_extractor(ie_name):
 854     """Returns the info extractor class with the given ie_name"""
 855     return globals()[ie_name+'IE']