_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bandcamp import BandcampIE
  24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  25 from .extractor.comedycentral import ComedyCentralIE
  26 from .extractor.collegehumor import CollegeHumorIE
  27 from .extractor.dailymotion import DailymotionIE
  28 from .extractor.depositfiles import DepositFilesIE
  29 from .extractor.eighttracks import EightTracksIE
  30 from .extractor.escapist import EscapistIE
  31 from .extractor.facebook import FacebookIE
  32 from .extractor.funnyordie import FunnyOrDieIE
  33 from .extractor.gametrailers import GametrailersIE
  34 from .extractor.generic import GenericIE
  35 from .extractor.googleplus import GooglePlusIE
  36 from .extractor.googlesearch import GoogleSearchIE
  37 from .extractor.infoq import InfoQIE
  38 from .extractor.justintv import JustinTVIE
  39 from .extractor.keek import KeekIE
  40 from .extractor.liveleak import LiveLeakIE
  41 from .extractor.metacafe import MetacafeIE
  42 from .extractor.mixcloud import MixcloudIE
  43 from .extractor.mtv import MTVIE
  44 from .extractor.myspass import MySpassIE
  45 from .extractor.myvideo import MyVideoIE
  46 from .extractor.nba import NBAIE
  47 from .extractor.statigram import StatigramIE
  48 from .extractor.photobucket import PhotobucketIE
  49 from .extractor.pornotube import PornotubeIE
  50 from .extractor.rbmaradio import RBMARadioIE
  51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  52 from .extractor.spiegel import SpiegelIE
  53 from .extractor.stanfordoc import StanfordOpenClassroomIE
  54 from .extractor.steam import SteamIE
  55 from .extractor.ted import TEDIE
  56 from .extractor.tumblr import TumblrIE
  57 from .extractor.ustream import UstreamIE
  58 from .extractor.vimeo import VimeoIE
  59 from .extractor.worldstarhiphop import WorldStarHipHopIE
  60 from .extractor.xnxx import XNXXIE
  61 from .extractor.xvideos import XVideosIE
  62 from .extractor.yahoo import YahooIE, YahooSearchIE
  63 from .extractor.youjizz import YouJizzIE
  64 from .extractor.youku import YoukuIE
  65 from .extractor.youporn import YouPornIE
  66 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  67 from .extractor.zdf import ZDFIE
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103 class RedTubeIE(InfoExtractor):
 104     """Information Extractor for redtube"""
 105     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 106
 107     def _real_extract(self,url):
 108         mobj = re.match(self._VALID_URL, url)
 109         if mobj is None:
 110             raise ExtractorError(u'Invalid URL: %s' % url)
 111
 112         video_id = mobj.group('id')
 113         video_extension = 'mp4'
 114         webpage = self._download_webpage(url, video_id)
 115
 116         self.report_extraction(video_id)
 117
 118         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 119             webpage, u'video URL')
 120
 121         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 122             webpage, u'title')
 123
 124         return [{
 125             'id':       video_id,
 126             'url':      video_url,
 127             'ext':      video_extension,
 128             'title':    video_title,
 129         }]
 130
 131 class InaIE(InfoExtractor):
 132     """Information Extractor for Ina.fr"""
 133     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 134
 135     def _real_extract(self,url):
 136         mobj = re.match(self._VALID_URL, url)
 137
 138         video_id = mobj.group('id')
 139         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 140         video_extension = 'mp4'
 141         webpage = self._download_webpage(mrss_url, video_id)
 142
 143         self.report_extraction(video_id)
 144
 145         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 146             webpage, u'video URL')
 147
 148         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 149             webpage, u'title')
 150
 151         return [{
 152             'id':       video_id,
 153             'url':      video_url,
 154             'ext':      video_extension,
 155             'title':    video_title,
 156         }]
 157
 158 class HowcastIE(InfoExtractor):
 159     """Information Extractor for Howcast.com"""
 160     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 161
 162     def _real_extract(self, url):
 163         mobj = re.match(self._VALID_URL, url)
 164
 165         video_id = mobj.group('id')
 166         webpage_url = 'http://www.howcast.com/videos/' + video_id
 167         webpage = self._download_webpage(webpage_url, video_id)
 168
 169         self.report_extraction(video_id)
 170
 171         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 172             webpage, u'video URL')
 173
 174         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 175             webpage, u'title')
 176
 177         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 178             webpage, u'description', fatal=False)
 179
 180         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 181             webpage, u'thumbnail', fatal=False)
 182
 183         return [{
 184             'id':       video_id,
 185             'url':      video_url,
 186             'ext':      'mp4',
 187             'title':    video_title,
 188             'description': video_description,
 189             'thumbnail': thumbnail,
 190         }]
 191
 192 class VineIE(InfoExtractor):
 193     """Information Extractor for Vine.co"""
 194     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 195
 196     def _real_extract(self, url):
 197         mobj = re.match(self._VALID_URL, url)
 198
 199         video_id = mobj.group('id')
 200         webpage_url = 'https://vine.co/v/' + video_id
 201         webpage = self._download_webpage(webpage_url, video_id)
 202
 203         self.report_extraction(video_id)
 204
 205         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 206             webpage, u'video URL')
 207
 208         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 209             webpage, u'title')
 210
 211         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 212             webpage, u'thumbnail', fatal=False)
 213
 214         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 215             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 216
 217         return [{
 218             'id':        video_id,
 219             'url':       video_url,
 220             'ext':       'mp4',
 221             'title':     video_title,
 222             'thumbnail': thumbnail,
 223             'uploader':  uploader,
 224         }]
 225
 226 class FlickrIE(InfoExtractor):
 227     """Information Extractor for Flickr videos"""
 228     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 229
 230     def _real_extract(self, url):
 231         mobj = re.match(self._VALID_URL, url)
 232
 233         video_id = mobj.group('id')
 234         video_uploader_id = mobj.group('uploader_id')
 235         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 236         webpage = self._download_webpage(webpage_url, video_id)
 237
 238         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 239
 240         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 241         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 242
 243         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 244             first_xml, u'node_id')
 245
 246         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 247         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 248
 249         self.report_extraction(video_id)
 250
 251         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 252         if mobj is None:
 253             raise ExtractorError(u'Unable to extract video url')
 254         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 255
 256         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 257             webpage, u'video title')
 258
 259         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 260             webpage, u'description', fatal=False)
 261
 262         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 263             webpage, u'thumbnail', fatal=False)
 264
 265         return [{
 266             'id':          video_id,
 267             'url':         video_url,
 268             'ext':         'mp4',
 269             'title':       video_title,
 270             'description': video_description,
 271             'thumbnail':   thumbnail,
 272             'uploader_id': video_uploader_id,
 273         }]
 274
 275 class TeamcocoIE(InfoExtractor):
 276     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 277
 278     def _real_extract(self, url):
 279         mobj = re.match(self._VALID_URL, url)
 280         if mobj is None:
 281             raise ExtractorError(u'Invalid URL: %s' % url)
 282         url_title = mobj.group('url_title')
 283         webpage = self._download_webpage(url, url_title)
 284
 285         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 286             webpage, u'video id')
 287
 288         self.report_extraction(video_id)
 289
 290         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 291             webpage, u'title')
 292
 293         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 294             webpage, u'thumbnail', fatal=False)
 295
 296         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 297             webpage, u'description', fatal=False)
 298
 299         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 300         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 301
 302         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 303             data, u'video URL')
 304
 305         return [{
 306             'id':          video_id,
 307             'url':         video_url,
 308             'ext':         'mp4',
 309             'title':       video_title,
 310             'thumbnail':   thumbnail,
 311             'description': video_description,
 312         }]
 313
 314 class XHamsterIE(InfoExtractor):
 315     """Information Extractor for xHamster"""
 316     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 317
 318     def _real_extract(self,url):
 319         mobj = re.match(self._VALID_URL, url)
 320
 321         video_id = mobj.group('id')
 322         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 323         webpage = self._download_webpage(mrss_url, video_id)
 324
 325         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 326         if mobj is None:
 327             raise ExtractorError(u'Unable to extract media URL')
 328         if len(mobj.group('server')) == 0:
 329             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 330         else:
 331             video_url = mobj.group('server')+'/key='+mobj.group('file')
 332         video_extension = video_url.split('.')[-1]
 333
 334         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 335             webpage, u'title')
 336
 337         # Can't see the description anywhere in the UI
 338         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 339         #     webpage, u'description', fatal=False)
 340         # if video_description: video_description = unescapeHTML(video_description)
 341
 342         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 343         if mobj:
 344             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 345         else:
 346             video_upload_date = None
 347             self._downloader.report_warning(u'Unable to extract upload date')
 348
 349         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 350             webpage, u'uploader id', default=u'anonymous')
 351
 352         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 353             webpage, u'thumbnail', fatal=False)
 354
 355         return [{
 356             'id':       video_id,
 357             'url':      video_url,
 358             'ext':      video_extension,
 359             'title':    video_title,
 360             # 'description': video_description,
 361             'upload_date': video_upload_date,
 362             'uploader_id': video_uploader_id,
 363             'thumbnail': video_thumbnail
 364         }]
 365
 366 class HypemIE(InfoExtractor):
 367     """Information Extractor for hypem"""
 368     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 369
 370     def _real_extract(self, url):
 371         mobj = re.match(self._VALID_URL, url)
 372         if mobj is None:
 373             raise ExtractorError(u'Invalid URL: %s' % url)
 374         track_id = mobj.group(1)
 375
 376         data = { 'ax': 1, 'ts': time.time() }
 377         data_encoded = compat_urllib_parse.urlencode(data)
 378         complete_url = url + "?" + data_encoded
 379         request = compat_urllib_request.Request(complete_url)
 380         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 381         cookie = urlh.headers.get('Set-Cookie', '')
 382
 383         self.report_extraction(track_id)
 384
 385         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 386             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 387         try:
 388             track_list = json.loads(html_tracks)
 389             track = track_list[u'tracks'][0]
 390         except ValueError:
 391             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 392
 393         key = track[u"key"]
 394         track_id = track[u"id"]
 395         artist = track[u"artist"]
 396         title = track[u"song"]
 397
 398         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 399         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 400         request.add_header('cookie', cookie)
 401         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 402         try:
 403             song_data = json.loads(song_data_json)
 404         except ValueError:
 405             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 406         final_url = song_data[u"url"]
 407
 408         return [{
 409             'id':       track_id,
 410             'url':      final_url,
 411             'ext':      "mp3",
 412             'title':    title,
 413             'artist':   artist,
 414         }]
 415
 416 class Vbox7IE(InfoExtractor):
 417     """Information Extractor for Vbox7"""
 418     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 419
 420     def _real_extract(self,url):
 421         mobj = re.match(self._VALID_URL, url)
 422         if mobj is None:
 423             raise ExtractorError(u'Invalid URL: %s' % url)
 424         video_id = mobj.group(1)
 425
 426         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 427         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 428         redirect_url = urlh.geturl() + new_location
 429         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 430
 431         title = self._html_search_regex(r'<title>(.*)</title>',
 432             webpage, u'title').split('/')[0].strip()
 433
 434         ext = "flv"
 435         info_url = "http://vbox7.com/play/magare.do"
 436         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 437         info_request = compat_urllib_request.Request(info_url, data)
 438         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 439         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 440         if info_response is None:
 441             raise ExtractorError(u'Unable to extract the media url')
 442         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 443
 444         return [{
 445             'id':        video_id,
 446             'url':       final_url,
 447             'ext':       ext,
 448             'title':     title,
 449             'thumbnail': thumbnail_url,
 450         }]
 451
 452
 453 def gen_extractors():
 454     """ Return a list of an instance of every supported extractor.
 455     The order does matter; the first extractor matched is the one handling the URL.
 456     """
 457     return [
 458         YoutubePlaylistIE(),
 459         YoutubeChannelIE(),
 460         YoutubeUserIE(),
 461         YoutubeSearchIE(),
 462         YoutubeIE(),
 463         MetacafeIE(),
 464         DailymotionIE(),
 465         GoogleSearchIE(),
 466         PhotobucketIE(),
 467         YahooIE(),
 468         YahooSearchIE(),
 469         DepositFilesIE(),
 470         FacebookIE(),
 471         BlipTVIE(),
 472         BlipTVUserIE(),
 473         VimeoIE(),
 474         MyVideoIE(),
 475         ComedyCentralIE(),
 476         EscapistIE(),
 477         CollegeHumorIE(),
 478         XVideosIE(),
 479         SoundcloudSetIE(),
 480         SoundcloudIE(),
 481         InfoQIE(),
 482         MixcloudIE(),
 483         StanfordOpenClassroomIE(),
 484         MTVIE(),
 485         YoukuIE(),
 486         XNXXIE(),
 487         YouJizzIE(),
 488         PornotubeIE(),
 489         YouPornIE(),
 490         GooglePlusIE(),
 491         ArteTvIE(),
 492         NBAIE(),
 493         WorldStarHipHopIE(),
 494         JustinTVIE(),
 495         FunnyOrDieIE(),
 496         SteamIE(),
 497         UstreamIE(),
 498         RBMARadioIE(),
 499         EightTracksIE(),
 500         KeekIE(),
 501         TEDIE(),
 502         MySpassIE(),
 503         SpiegelIE(),
 504         LiveLeakIE(),
 505         ARDIE(),
 506         ZDFIE(),
 507         TumblrIE(),
 508         BandcampIE(),
 509         RedTubeIE(),
 510         InaIE(),
 511         HowcastIE(),
 512         VineIE(),
 513         FlickrIE(),
 514         TeamcocoIE(),
 515         XHamsterIE(),
 516         HypemIE(),
 517         Vbox7IE(),
 518         GametrailersIE(),
 519         StatigramIE(),
 520         GenericIE()
 521     ]
 522
 523 def get_info_extractor(ie_name):
 524     """Returns the info extractor class with the given ie_name"""
 525     return globals()[ie_name+'IE']