_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bandcamp import BandcampIE
  24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  25 from .extractor.comedycentral import ComedyCentralIE
  26 from .extractor.collegehumor import CollegeHumorIE
  27 from .extractor.dailymotion import DailymotionIE
  28 from .extractor.depositfiles import DepositFilesIE
  29 from .extractor.eighttracks import EightTracksIE
  30 from .extractor.escapist import EscapistIE
  31 from .extractor.facebook import FacebookIE
  32 from .extractor.funnyordie import FunnyOrDieIE
  33 from .extractor.gametrailers import GametrailersIE
  34 from .extractor.generic import GenericIE
  35 from .extractor.googleplus import GooglePlusIE
  36 from .extractor.googlesearch import GoogleSearchIE
  37 from .extractor.ina import InaIE
  38 from .extractor.infoq import InfoQIE
  39 from .extractor.justintv import JustinTVIE
  40 from .extractor.keek import KeekIE
  41 from .extractor.liveleak import LiveLeakIE
  42 from .extractor.metacafe import MetacafeIE
  43 from .extractor.mixcloud import MixcloudIE
  44 from .extractor.mtv import MTVIE
  45 from .extractor.myspass import MySpassIE
  46 from .extractor.myvideo import MyVideoIE
  47 from .extractor.nba import NBAIE
  48 from .extractor.statigram import StatigramIE
  49 from .extractor.photobucket import PhotobucketIE
  50 from .extractor.pornotube import PornotubeIE
  51 from .extractor.rbmaradio import RBMARadioIE
  52 from .extractor.redtube import RedTubeIE
  53 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  54 from .extractor.spiegel import SpiegelIE
  55 from .extractor.stanfordoc import StanfordOpenClassroomIE
  56 from .extractor.steam import SteamIE
  57 from .extractor.ted import TEDIE
  58 from .extractor.tumblr import TumblrIE
  59 from .extractor.ustream import UstreamIE
  60 from .extractor.vbox7 import Vbox7IE
  61 from .extractor.vimeo import VimeoIE
  62 from .extractor.vine import VineIE
  63 from .extractor.worldstarhiphop import WorldStarHipHopIE
  64 from .extractor.xnxx import XNXXIE
  65 from .extractor.xvideos import XVideosIE
  66 from .extractor.yahoo import YahooIE, YahooSearchIE
  67 from .extractor.youjizz import YouJizzIE
  68 from .extractor.youku import YoukuIE
  69 from .extractor.youporn import YouPornIE
  70 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  71 from .extractor.zdf import ZDFIE
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108 class HowcastIE(InfoExtractor):
 109     """Information Extractor for Howcast.com"""
 110     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 111
 112     def _real_extract(self, url):
 113         mobj = re.match(self._VALID_URL, url)
 114
 115         video_id = mobj.group('id')
 116         webpage_url = 'http://www.howcast.com/videos/' + video_id
 117         webpage = self._download_webpage(webpage_url, video_id)
 118
 119         self.report_extraction(video_id)
 120
 121         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 122             webpage, u'video URL')
 123
 124         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 125             webpage, u'title')
 126
 127         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 128             webpage, u'description', fatal=False)
 129
 130         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 131             webpage, u'thumbnail', fatal=False)
 132
 133         return [{
 134             'id':       video_id,
 135             'url':      video_url,
 136             'ext':      'mp4',
 137             'title':    video_title,
 138             'description': video_description,
 139             'thumbnail': thumbnail,
 140         }]
 141
 142
 143 class FlickrIE(InfoExtractor):
 144     """Information Extractor for Flickr videos"""
 145     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 146
 147     def _real_extract(self, url):
 148         mobj = re.match(self._VALID_URL, url)
 149
 150         video_id = mobj.group('id')
 151         video_uploader_id = mobj.group('uploader_id')
 152         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 153         webpage = self._download_webpage(webpage_url, video_id)
 154
 155         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 156
 157         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 158         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 159
 160         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 161             first_xml, u'node_id')
 162
 163         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 164         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 165
 166         self.report_extraction(video_id)
 167
 168         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 169         if mobj is None:
 170             raise ExtractorError(u'Unable to extract video url')
 171         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 172
 173         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 174             webpage, u'video title')
 175
 176         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 177             webpage, u'description', fatal=False)
 178
 179         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 180             webpage, u'thumbnail', fatal=False)
 181
 182         return [{
 183             'id':          video_id,
 184             'url':         video_url,
 185             'ext':         'mp4',
 186             'title':       video_title,
 187             'description': video_description,
 188             'thumbnail':   thumbnail,
 189             'uploader_id': video_uploader_id,
 190         }]
 191
 192 class TeamcocoIE(InfoExtractor):
 193     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 194
 195     def _real_extract(self, url):
 196         mobj = re.match(self._VALID_URL, url)
 197         if mobj is None:
 198             raise ExtractorError(u'Invalid URL: %s' % url)
 199         url_title = mobj.group('url_title')
 200         webpage = self._download_webpage(url, url_title)
 201
 202         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 203             webpage, u'video id')
 204
 205         self.report_extraction(video_id)
 206
 207         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 208             webpage, u'title')
 209
 210         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 211             webpage, u'thumbnail', fatal=False)
 212
 213         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 214             webpage, u'description', fatal=False)
 215
 216         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 217         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 218
 219         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 220             data, u'video URL')
 221
 222         return [{
 223             'id':          video_id,
 224             'url':         video_url,
 225             'ext':         'mp4',
 226             'title':       video_title,
 227             'thumbnail':   thumbnail,
 228             'description': video_description,
 229         }]
 230
 231 class XHamsterIE(InfoExtractor):
 232     """Information Extractor for xHamster"""
 233     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 234
 235     def _real_extract(self,url):
 236         mobj = re.match(self._VALID_URL, url)
 237
 238         video_id = mobj.group('id')
 239         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 240         webpage = self._download_webpage(mrss_url, video_id)
 241
 242         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 243         if mobj is None:
 244             raise ExtractorError(u'Unable to extract media URL')
 245         if len(mobj.group('server')) == 0:
 246             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 247         else:
 248             video_url = mobj.group('server')+'/key='+mobj.group('file')
 249         video_extension = video_url.split('.')[-1]
 250
 251         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 252             webpage, u'title')
 253
 254         # Can't see the description anywhere in the UI
 255         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 256         #     webpage, u'description', fatal=False)
 257         # if video_description: video_description = unescapeHTML(video_description)
 258
 259         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 260         if mobj:
 261             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 262         else:
 263             video_upload_date = None
 264             self._downloader.report_warning(u'Unable to extract upload date')
 265
 266         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 267             webpage, u'uploader id', default=u'anonymous')
 268
 269         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 270             webpage, u'thumbnail', fatal=False)
 271
 272         return [{
 273             'id':       video_id,
 274             'url':      video_url,
 275             'ext':      video_extension,
 276             'title':    video_title,
 277             # 'description': video_description,
 278             'upload_date': video_upload_date,
 279             'uploader_id': video_uploader_id,
 280             'thumbnail': video_thumbnail
 281         }]
 282
 283 class HypemIE(InfoExtractor):
 284     """Information Extractor for hypem"""
 285     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 286
 287     def _real_extract(self, url):
 288         mobj = re.match(self._VALID_URL, url)
 289         if mobj is None:
 290             raise ExtractorError(u'Invalid URL: %s' % url)
 291         track_id = mobj.group(1)
 292
 293         data = { 'ax': 1, 'ts': time.time() }
 294         data_encoded = compat_urllib_parse.urlencode(data)
 295         complete_url = url + "?" + data_encoded
 296         request = compat_urllib_request.Request(complete_url)
 297         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 298         cookie = urlh.headers.get('Set-Cookie', '')
 299
 300         self.report_extraction(track_id)
 301
 302         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 303             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 304         try:
 305             track_list = json.loads(html_tracks)
 306             track = track_list[u'tracks'][0]
 307         except ValueError:
 308             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 309
 310         key = track[u"key"]
 311         track_id = track[u"id"]
 312         artist = track[u"artist"]
 313         title = track[u"song"]
 314
 315         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 316         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 317         request.add_header('cookie', cookie)
 318         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 319         try:
 320             song_data = json.loads(song_data_json)
 321         except ValueError:
 322             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 323         final_url = song_data[u"url"]
 324
 325         return [{
 326             'id':       track_id,
 327             'url':      final_url,
 328             'ext':      "mp3",
 329             'title':    title,
 330             'artist':   artist,
 331         }]
 332
 333
 334
 335 def gen_extractors():
 336     """ Return a list of an instance of every supported extractor.
 337     The order does matter; the first extractor matched is the one handling the URL.
 338     """
 339     return [
 340         YoutubePlaylistIE(),
 341         YoutubeChannelIE(),
 342         YoutubeUserIE(),
 343         YoutubeSearchIE(),
 344         YoutubeIE(),
 345         MetacafeIE(),
 346         DailymotionIE(),
 347         GoogleSearchIE(),
 348         PhotobucketIE(),
 349         YahooIE(),
 350         YahooSearchIE(),
 351         DepositFilesIE(),
 352         FacebookIE(),
 353         BlipTVIE(),
 354         BlipTVUserIE(),
 355         VimeoIE(),
 356         MyVideoIE(),
 357         ComedyCentralIE(),
 358         EscapistIE(),
 359         CollegeHumorIE(),
 360         XVideosIE(),
 361         SoundcloudSetIE(),
 362         SoundcloudIE(),
 363         InfoQIE(),
 364         MixcloudIE(),
 365         StanfordOpenClassroomIE(),
 366         MTVIE(),
 367         YoukuIE(),
 368         XNXXIE(),
 369         YouJizzIE(),
 370         PornotubeIE(),
 371         YouPornIE(),
 372         GooglePlusIE(),
 373         ArteTvIE(),
 374         NBAIE(),
 375         WorldStarHipHopIE(),
 376         JustinTVIE(),
 377         FunnyOrDieIE(),
 378         SteamIE(),
 379         UstreamIE(),
 380         RBMARadioIE(),
 381         EightTracksIE(),
 382         KeekIE(),
 383         TEDIE(),
 384         MySpassIE(),
 385         SpiegelIE(),
 386         LiveLeakIE(),
 387         ARDIE(),
 388         ZDFIE(),
 389         TumblrIE(),
 390         BandcampIE(),
 391         RedTubeIE(),
 392         InaIE(),
 393         HowcastIE(),
 394         VineIE(),
 395         FlickrIE(),
 396         TeamcocoIE(),
 397         XHamsterIE(),
 398         HypemIE(),
 399         Vbox7IE(),
 400         GametrailersIE(),
 401         StatigramIE(),
 402         GenericIE()
 403     ]
 404
 405 def get_info_extractor(ie_name):
 406     """Returns the info extractor class with the given ie_name"""
 407     return globals()[ie_name+'IE']