_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.eighttracks import EightTracksIE
  29 from .extractor.escapist import EscapistIE
  30 from .extractor.facebook import FacebookIE
  31 from .extractor.funnyordie import FunnyOrDieIE
  32 from .extractor.gametrailers import GametrailersIE
  33 from .extractor.generic import GenericIE
  34 from .extractor.googleplus import GooglePlusIE
  35 from .extractor.googlesearch import GoogleSearchIE
  36 from .extractor.infoq import InfoQIE
  37 from .extractor.justintv import JustinTVIE
  38 from .extractor.keek import KeekIE
  39 from .extractor.liveleak import LiveLeakIE
  40 from .extractor.metacafe import MetacafeIE
  41 from .extractor.mixcloud import MixcloudIE
  42 from .extractor.mtv import MTVIE
  43 from .extractor.myspass import MySpassIE
  44 from .extractor.myvideo import MyVideoIE
  45 from .extractor.nba import NBAIE
  46 from .extractor.statigram import StatigramIE
  47 from .extractor.photobucket import PhotobucketIE
  48 from .extractor.pornotube import PornotubeIE
  49 from .extractor.rbmaradio import RBMARadioIE
  50 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  51 from .extractor.spiegel import SpiegelIE
  52 from .extractor.stanfordoc import StanfordOpenClassroomIE
  53 from .extractor.steam import SteamIE
  54 from .extractor.ted import TEDIE
  55 from .extractor.tumblr import TumblrIE
  56 from .extractor.ustream import UstreamIE
  57 from .extractor.vimeo import VimeoIE
  58 from .extractor.worldstarhiphop import WorldStarHipHopIE
  59 from .extractor.xnxx import XNXXIE
  60 from .extractor.xvideos import XVideosIE
  61 from .extractor.yahoo import YahooIE, YahooSearchIE
  62 from .extractor.youjizz import YouJizzIE
  63 from .extractor.youku import YoukuIE
  64 from .extractor.youporn import YouPornIE
  65 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  66 from .extractor.zdf import ZDFIE
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101 class BandcampIE(InfoExtractor):
 102     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 103
 104     def _real_extract(self, url):
 105         mobj = re.match(self._VALID_URL, url)
 106         title = mobj.group('title')
 107         webpage = self._download_webpage(url, title)
 108         # We get the link to the free download page
 109         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 110         if m_download is None:
 111             raise ExtractorError(u'No free songs found')
 112
 113         download_link = m_download.group(1)
 114         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 115                        webpage, re.MULTILINE|re.DOTALL).group('id')
 116
 117         download_webpage = self._download_webpage(download_link, id,
 118                                                   'Downloading free downloads page')
 119         # We get the dictionary of the track from some javascrip code
 120         info = re.search(r'items: (.*?),$',
 121                          download_webpage, re.MULTILINE).group(1)
 122         info = json.loads(info)[0]
 123         # We pick mp3-320 for now, until format selection can be easily implemented.
 124         mp3_info = info[u'downloads'][u'mp3-320']
 125         # If we try to use this url it says the link has expired
 126         initial_url = mp3_info[u'url']
 127         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 128         m_url = re.match(re_url, initial_url)
 129         #We build the url we will use to get the final track url
 130         # This url is build in Bandcamp in the script download_bunde_*.js
 131         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 132         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 133         # If we could correctly generate the .rand field the url would be
 134         #in the "download_url" key
 135         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 136
 137         track_info = {'id':id,
 138                       'title' : info[u'title'],
 139                       'ext' :   'mp3',
 140                       'url' :   final_url,
 141                       'thumbnail' : info[u'thumb_url'],
 142                       'uploader' :  info[u'artist']
 143                       }
 144
 145         return [track_info]
 146
 147 class RedTubeIE(InfoExtractor):
 148     """Information Extractor for redtube"""
 149     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 150
 151     def _real_extract(self,url):
 152         mobj = re.match(self._VALID_URL, url)
 153         if mobj is None:
 154             raise ExtractorError(u'Invalid URL: %s' % url)
 155
 156         video_id = mobj.group('id')
 157         video_extension = 'mp4'
 158         webpage = self._download_webpage(url, video_id)
 159
 160         self.report_extraction(video_id)
 161
 162         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 163             webpage, u'video URL')
 164
 165         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 166             webpage, u'title')
 167
 168         return [{
 169             'id':       video_id,
 170             'url':      video_url,
 171             'ext':      video_extension,
 172             'title':    video_title,
 173         }]
 174
 175 class InaIE(InfoExtractor):
 176     """Information Extractor for Ina.fr"""
 177     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 178
 179     def _real_extract(self,url):
 180         mobj = re.match(self._VALID_URL, url)
 181
 182         video_id = mobj.group('id')
 183         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 184         video_extension = 'mp4'
 185         webpage = self._download_webpage(mrss_url, video_id)
 186
 187         self.report_extraction(video_id)
 188
 189         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 190             webpage, u'video URL')
 191
 192         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 193             webpage, u'title')
 194
 195         return [{
 196             'id':       video_id,
 197             'url':      video_url,
 198             'ext':      video_extension,
 199             'title':    video_title,
 200         }]
 201
 202 class HowcastIE(InfoExtractor):
 203     """Information Extractor for Howcast.com"""
 204     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 205
 206     def _real_extract(self, url):
 207         mobj = re.match(self._VALID_URL, url)
 208
 209         video_id = mobj.group('id')
 210         webpage_url = 'http://www.howcast.com/videos/' + video_id
 211         webpage = self._download_webpage(webpage_url, video_id)
 212
 213         self.report_extraction(video_id)
 214
 215         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 216             webpage, u'video URL')
 217
 218         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 219             webpage, u'title')
 220
 221         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 222             webpage, u'description', fatal=False)
 223
 224         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 225             webpage, u'thumbnail', fatal=False)
 226
 227         return [{
 228             'id':       video_id,
 229             'url':      video_url,
 230             'ext':      'mp4',
 231             'title':    video_title,
 232             'description': video_description,
 233             'thumbnail': thumbnail,
 234         }]
 235
 236 class VineIE(InfoExtractor):
 237     """Information Extractor for Vine.co"""
 238     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 239
 240     def _real_extract(self, url):
 241         mobj = re.match(self._VALID_URL, url)
 242
 243         video_id = mobj.group('id')
 244         webpage_url = 'https://vine.co/v/' + video_id
 245         webpage = self._download_webpage(webpage_url, video_id)
 246
 247         self.report_extraction(video_id)
 248
 249         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 250             webpage, u'video URL')
 251
 252         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 253             webpage, u'title')
 254
 255         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 256             webpage, u'thumbnail', fatal=False)
 257
 258         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 259             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 260
 261         return [{
 262             'id':        video_id,
 263             'url':       video_url,
 264             'ext':       'mp4',
 265             'title':     video_title,
 266             'thumbnail': thumbnail,
 267             'uploader':  uploader,
 268         }]
 269
 270 class FlickrIE(InfoExtractor):
 271     """Information Extractor for Flickr videos"""
 272     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 273
 274     def _real_extract(self, url):
 275         mobj = re.match(self._VALID_URL, url)
 276
 277         video_id = mobj.group('id')
 278         video_uploader_id = mobj.group('uploader_id')
 279         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 280         webpage = self._download_webpage(webpage_url, video_id)
 281
 282         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 283
 284         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 285         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 286
 287         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 288             first_xml, u'node_id')
 289
 290         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 291         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 292
 293         self.report_extraction(video_id)
 294
 295         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 296         if mobj is None:
 297             raise ExtractorError(u'Unable to extract video url')
 298         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 299
 300         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 301             webpage, u'video title')
 302
 303         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 304             webpage, u'description', fatal=False)
 305
 306         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 307             webpage, u'thumbnail', fatal=False)
 308
 309         return [{
 310             'id':          video_id,
 311             'url':         video_url,
 312             'ext':         'mp4',
 313             'title':       video_title,
 314             'description': video_description,
 315             'thumbnail':   thumbnail,
 316             'uploader_id': video_uploader_id,
 317         }]
 318
 319 class TeamcocoIE(InfoExtractor):
 320     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 321
 322     def _real_extract(self, url):
 323         mobj = re.match(self._VALID_URL, url)
 324         if mobj is None:
 325             raise ExtractorError(u'Invalid URL: %s' % url)
 326         url_title = mobj.group('url_title')
 327         webpage = self._download_webpage(url, url_title)
 328
 329         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 330             webpage, u'video id')
 331
 332         self.report_extraction(video_id)
 333
 334         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 335             webpage, u'title')
 336
 337         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 338             webpage, u'thumbnail', fatal=False)
 339
 340         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 341             webpage, u'description', fatal=False)
 342
 343         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 344         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 345
 346         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 347             data, u'video URL')
 348
 349         return [{
 350             'id':          video_id,
 351             'url':         video_url,
 352             'ext':         'mp4',
 353             'title':       video_title,
 354             'thumbnail':   thumbnail,
 355             'description': video_description,
 356         }]
 357
 358 class XHamsterIE(InfoExtractor):
 359     """Information Extractor for xHamster"""
 360     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 361
 362     def _real_extract(self,url):
 363         mobj = re.match(self._VALID_URL, url)
 364
 365         video_id = mobj.group('id')
 366         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 367         webpage = self._download_webpage(mrss_url, video_id)
 368
 369         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 370         if mobj is None:
 371             raise ExtractorError(u'Unable to extract media URL')
 372         if len(mobj.group('server')) == 0:
 373             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 374         else:
 375             video_url = mobj.group('server')+'/key='+mobj.group('file')
 376         video_extension = video_url.split('.')[-1]
 377
 378         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 379             webpage, u'title')
 380
 381         # Can't see the description anywhere in the UI
 382         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 383         #     webpage, u'description', fatal=False)
 384         # if video_description: video_description = unescapeHTML(video_description)
 385
 386         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 387         if mobj:
 388             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 389         else:
 390             video_upload_date = None
 391             self._downloader.report_warning(u'Unable to extract upload date')
 392
 393         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 394             webpage, u'uploader id', default=u'anonymous')
 395
 396         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 397             webpage, u'thumbnail', fatal=False)
 398
 399         return [{
 400             'id':       video_id,
 401             'url':      video_url,
 402             'ext':      video_extension,
 403             'title':    video_title,
 404             # 'description': video_description,
 405             'upload_date': video_upload_date,
 406             'uploader_id': video_uploader_id,
 407             'thumbnail': video_thumbnail
 408         }]
 409
 410 class HypemIE(InfoExtractor):
 411     """Information Extractor for hypem"""
 412     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 413
 414     def _real_extract(self, url):
 415         mobj = re.match(self._VALID_URL, url)
 416         if mobj is None:
 417             raise ExtractorError(u'Invalid URL: %s' % url)
 418         track_id = mobj.group(1)
 419
 420         data = { 'ax': 1, 'ts': time.time() }
 421         data_encoded = compat_urllib_parse.urlencode(data)
 422         complete_url = url + "?" + data_encoded
 423         request = compat_urllib_request.Request(complete_url)
 424         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 425         cookie = urlh.headers.get('Set-Cookie', '')
 426
 427         self.report_extraction(track_id)
 428
 429         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 430             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 431         try:
 432             track_list = json.loads(html_tracks)
 433             track = track_list[u'tracks'][0]
 434         except ValueError:
 435             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 436
 437         key = track[u"key"]
 438         track_id = track[u"id"]
 439         artist = track[u"artist"]
 440         title = track[u"song"]
 441
 442         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 443         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 444         request.add_header('cookie', cookie)
 445         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 446         try:
 447             song_data = json.loads(song_data_json)
 448         except ValueError:
 449             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 450         final_url = song_data[u"url"]
 451
 452         return [{
 453             'id':       track_id,
 454             'url':      final_url,
 455             'ext':      "mp3",
 456             'title':    title,
 457             'artist':   artist,
 458         }]
 459
 460 class Vbox7IE(InfoExtractor):
 461     """Information Extractor for Vbox7"""
 462     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 463
 464     def _real_extract(self,url):
 465         mobj = re.match(self._VALID_URL, url)
 466         if mobj is None:
 467             raise ExtractorError(u'Invalid URL: %s' % url)
 468         video_id = mobj.group(1)
 469
 470         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 471         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 472         redirect_url = urlh.geturl() + new_location
 473         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 474
 475         title = self._html_search_regex(r'<title>(.*)</title>',
 476             webpage, u'title').split('/')[0].strip()
 477
 478         ext = "flv"
 479         info_url = "http://vbox7.com/play/magare.do"
 480         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 481         info_request = compat_urllib_request.Request(info_url, data)
 482         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 483         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 484         if info_response is None:
 485             raise ExtractorError(u'Unable to extract the media url')
 486         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 487
 488         return [{
 489             'id':        video_id,
 490             'url':       final_url,
 491             'ext':       ext,
 492             'title':     title,
 493             'thumbnail': thumbnail_url,
 494         }]
 495
 496
 497 def gen_extractors():
 498     """ Return a list of an instance of every supported extractor.
 499     The order does matter; the first extractor matched is the one handling the URL.
 500     """
 501     return [
 502         YoutubePlaylistIE(),
 503         YoutubeChannelIE(),
 504         YoutubeUserIE(),
 505         YoutubeSearchIE(),
 506         YoutubeIE(),
 507         MetacafeIE(),
 508         DailymotionIE(),
 509         GoogleSearchIE(),
 510         PhotobucketIE(),
 511         YahooIE(),
 512         YahooSearchIE(),
 513         DepositFilesIE(),
 514         FacebookIE(),
 515         BlipTVIE(),
 516         BlipTVUserIE(),
 517         VimeoIE(),
 518         MyVideoIE(),
 519         ComedyCentralIE(),
 520         EscapistIE(),
 521         CollegeHumorIE(),
 522         XVideosIE(),
 523         SoundcloudSetIE(),
 524         SoundcloudIE(),
 525         InfoQIE(),
 526         MixcloudIE(),
 527         StanfordOpenClassroomIE(),
 528         MTVIE(),
 529         YoukuIE(),
 530         XNXXIE(),
 531         YouJizzIE(),
 532         PornotubeIE(),
 533         YouPornIE(),
 534         GooglePlusIE(),
 535         ArteTvIE(),
 536         NBAIE(),
 537         WorldStarHipHopIE(),
 538         JustinTVIE(),
 539         FunnyOrDieIE(),
 540         SteamIE(),
 541         UstreamIE(),
 542         RBMARadioIE(),
 543         EightTracksIE(),
 544         KeekIE(),
 545         TEDIE(),
 546         MySpassIE(),
 547         SpiegelIE(),
 548         LiveLeakIE(),
 549         ARDIE(),
 550         ZDFIE(),
 551         TumblrIE(),
 552         BandcampIE(),
 553         RedTubeIE(),
 554         InaIE(),
 555         HowcastIE(),
 556         VineIE(),
 557         FlickrIE(),
 558         TeamcocoIE(),
 559         XHamsterIE(),
 560         HypemIE(),
 561         Vbox7IE(),
 562         GametrailersIE(),
 563         StatigramIE(),
 564         GenericIE()
 565     ]
 566
 567 def get_info_extractor(ie_name):
 568     """Returns the info extractor class with the given ie_name"""
 569     return globals()[ie_name+'IE']