[redtube] move into own file
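For context, the RedTubeIE class now lives in its own module under the extractor package and is only imported here. Below is a minimal, illustrative-only sketch of the general shape such a file takes, following the extractor pattern used throughout this module; the _VALID_URL and page-scraping regexes are placeholders, not the contents of the actual new file.

youtube_dl/extractor/redtube.py (illustrative sketch)

import re

from .common import InfoExtractor


class RedTubeIE(InfoExtractor):
    """Illustrative sketch of a single-site extractor module."""
    # Placeholder pattern; the real module defines its own _VALID_URL.
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Placeholder regexes; the real extractor scrapes the page with its own patterns.
        video_url = self._html_search_regex(r'<source src="(.+?)"', webpage, u'video URL')
        video_title = self._html_search_regex(r'<title>(.+?)</title>', webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]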
youtube_dl/InfoExtractors.py
import base64
import datetime
import itertools
import json
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator
import hashlib
import binascii
import urllib

from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor

from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bandcamp import BandcampIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.collegehumor import CollegeHumorIE
from .extractor.dailymotion import DailymotionIE
from .extractor.depositfiles import DepositFilesIE
from .extractor.eighttracks import EightTracksIE
from .extractor.escapist import EscapistIE
from .extractor.facebook import FacebookIE
from .extractor.funnyordie import FunnyOrDieIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.infoq import InfoQIE
from .extractor.justintv import JustinTVIE
from .extractor.keek import KeekIE
from .extractor.liveleak import LiveLeakIE
from .extractor.metacafe import MetacafeIE
from .extractor.mixcloud import MixcloudIE
from .extractor.mtv import MTVIE
from .extractor.myspass import MySpassIE
from .extractor.myvideo import MyVideoIE
from .extractor.nba import NBAIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.pornotube import PornotubeIE
from .extractor.rbmaradio import RBMARadioIE
from .extractor.redtube import RedTubeIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.spiegel import SpiegelIE
from .extractor.stanfordoc import StanfordOpenClassroomIE
from .extractor.steam import SteamIE
from .extractor.ted import TEDIE
from .extractor.tumblr import TumblrIE
from .extractor.ustream import UstreamIE
from .extractor.vbox7 import Vbox7IE
from .extractor.vimeo import VimeoIE
from .extractor.vine import VineIE
from .extractor.worldstarhiphop import WorldStarHipHopIE
from .extractor.xnxx import XNXXIE
from .extractor.xvideos import XVideosIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youjizz import YouJizzIE
from .extractor.youku import YoukuIE
from .extractor.youporn import YouPornIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE


class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]

class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]


class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]

class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]

class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not visible anywhere in the current UI, so it is not extracted:
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]

class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]


def gen_extractors():
    """ Return a list containing an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        GenericIE()
    ]

def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name + 'IE']
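
Usage note: gen_extractors() returns the extractors in priority order, and the downloader walks that list, handing the URL to the first extractor whose suitable() check matches. A minimal sketch of that dispatch logic, assuming InfoExtractor.suitable(url) tests the URL against _VALID_URL (this helper is illustrative and not part of the module):

def _example_find_extractor(url):
    # Illustrative sketch only: return the first extractor that declares
    # itself suitable for the URL, mirroring how the downloader picks one.
    # Returns None if no extractor matches.
    for ie in gen_extractors():
        if ie.suitable(url):
            return ie
    return None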