_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bandcamp import BandcampIE
  24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  25 from .extractor.comedycentral import ComedyCentralIE
  26 from .extractor.collegehumor import CollegeHumorIE
  27 from .extractor.dailymotion import DailymotionIE
  28 from .extractor.depositfiles import DepositFilesIE
  29 from .extractor.eighttracks import EightTracksIE
  30 from .extractor.escapist import EscapistIE
  31 from .extractor.facebook import FacebookIE
  32 from .extractor.funnyordie import FunnyOrDieIE
  33 from .extractor.gametrailers import GametrailersIE
  34 from .extractor.generic import GenericIE
  35 from .extractor.googleplus import GooglePlusIE
  36 from .extractor.googlesearch import GoogleSearchIE
  37 from .extractor.hypem import HypemIE
  38 from .extractor.ina import InaIE
  39 from .extractor.infoq import InfoQIE
  40 from .extractor.justintv import JustinTVIE
  41 from .extractor.keek import KeekIE
  42 from .extractor.liveleak import LiveLeakIE
  43 from .extractor.metacafe import MetacafeIE
  44 from .extractor.mixcloud import MixcloudIE
  45 from .extractor.mtv import MTVIE
  46 from .extractor.myspass import MySpassIE
  47 from .extractor.myvideo import MyVideoIE
  48 from .extractor.nba import NBAIE
  49 from .extractor.statigram import StatigramIE
  50 from .extractor.photobucket import PhotobucketIE
  51 from .extractor.pornotube import PornotubeIE
  52 from .extractor.rbmaradio import RBMARadioIE
  53 from .extractor.redtube import RedTubeIE
  54 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  55 from .extractor.spiegel import SpiegelIE
  56 from .extractor.stanfordoc import StanfordOpenClassroomIE
  57 from .extractor.steam import SteamIE
  58 from .extractor.ted import TEDIE
  59 from .extractor.tumblr import TumblrIE
  60 from .extractor.ustream import UstreamIE
  61 from .extractor.vbox7 import Vbox7IE
  62 from .extractor.vimeo import VimeoIE
  63 from .extractor.vine import VineIE
  64 from .extractor.worldstarhiphop import WorldStarHipHopIE
  65 from .extractor.xnxx import XNXXIE
  66 from .extractor.xvideos import XVideosIE
  67 from .extractor.yahoo import YahooIE, YahooSearchIE
  68 from .extractor.youjizz import YouJizzIE
  69 from .extractor.youku import YoukuIE
  70 from .extractor.youporn import YouPornIE
  71 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  72 from .extractor.zdf import ZDFIE
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109 class HowcastIE(InfoExtractor):
 110     """Information Extractor for Howcast.com"""
 111     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 112
 113     def _real_extract(self, url):
 114         mobj = re.match(self._VALID_URL, url)
 115
 116         video_id = mobj.group('id')
 117         webpage_url = 'http://www.howcast.com/videos/' + video_id
 118         webpage = self._download_webpage(webpage_url, video_id)
 119
 120         self.report_extraction(video_id)
 121
 122         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 123             webpage, u'video URL')
 124
 125         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 126             webpage, u'title')
 127
 128         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 129             webpage, u'description', fatal=False)
 130
 131         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 132             webpage, u'thumbnail', fatal=False)
 133
 134         return [{
 135             'id':       video_id,
 136             'url':      video_url,
 137             'ext':      'mp4',
 138             'title':    video_title,
 139             'description': video_description,
 140             'thumbnail': thumbnail,
 141         }]
 142
 143
 144 class FlickrIE(InfoExtractor):
 145     """Information Extractor for Flickr videos"""
 146     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 147
 148     def _real_extract(self, url):
 149         mobj = re.match(self._VALID_URL, url)
 150
 151         video_id = mobj.group('id')
 152         video_uploader_id = mobj.group('uploader_id')
 153         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 154         webpage = self._download_webpage(webpage_url, video_id)
 155
 156         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 157
 158         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 159         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 160
 161         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 162             first_xml, u'node_id')
 163
 164         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 165         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 166
 167         self.report_extraction(video_id)
 168
 169         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 170         if mobj is None:
 171             raise ExtractorError(u'Unable to extract video url')
 172         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 173
 174         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 175             webpage, u'video title')
 176
 177         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 178             webpage, u'description', fatal=False)
 179
 180         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 181             webpage, u'thumbnail', fatal=False)
 182
 183         return [{
 184             'id':          video_id,
 185             'url':         video_url,
 186             'ext':         'mp4',
 187             'title':       video_title,
 188             'description': video_description,
 189             'thumbnail':   thumbnail,
 190             'uploader_id': video_uploader_id,
 191         }]
 192
 193 class TeamcocoIE(InfoExtractor):
 194     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 195
 196     def _real_extract(self, url):
 197         mobj = re.match(self._VALID_URL, url)
 198         if mobj is None:
 199             raise ExtractorError(u'Invalid URL: %s' % url)
 200         url_title = mobj.group('url_title')
 201         webpage = self._download_webpage(url, url_title)
 202
 203         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 204             webpage, u'video id')
 205
 206         self.report_extraction(video_id)
 207
 208         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 209             webpage, u'title')
 210
 211         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 212             webpage, u'thumbnail', fatal=False)
 213
 214         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 215             webpage, u'description', fatal=False)
 216
 217         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 218         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 219
 220         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 221             data, u'video URL')
 222
 223         return [{
 224             'id':          video_id,
 225             'url':         video_url,
 226             'ext':         'mp4',
 227             'title':       video_title,
 228             'thumbnail':   thumbnail,
 229             'description': video_description,
 230         }]
 231
 232 class XHamsterIE(InfoExtractor):
 233     """Information Extractor for xHamster"""
 234     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 235
 236     def _real_extract(self,url):
 237         mobj = re.match(self._VALID_URL, url)
 238
 239         video_id = mobj.group('id')
 240         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 241         webpage = self._download_webpage(mrss_url, video_id)
 242
 243         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 244         if mobj is None:
 245             raise ExtractorError(u'Unable to extract media URL')
 246         if len(mobj.group('server')) == 0:
 247             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 248         else:
 249             video_url = mobj.group('server')+'/key='+mobj.group('file')
 250         video_extension = video_url.split('.')[-1]
 251
 252         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 253             webpage, u'title')
 254
 255         # Can't see the description anywhere in the UI
 256         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 257         #     webpage, u'description', fatal=False)
 258         # if video_description: video_description = unescapeHTML(video_description)
 259
 260         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 261         if mobj:
 262             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 263         else:
 264             video_upload_date = None
 265             self._downloader.report_warning(u'Unable to extract upload date')
 266
 267         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 268             webpage, u'uploader id', default=u'anonymous')
 269
 270         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 271             webpage, u'thumbnail', fatal=False)
 272
 273         return [{
 274             'id':       video_id,
 275             'url':      video_url,
 276             'ext':      video_extension,
 277             'title':    video_title,
 278             # 'description': video_description,
 279             'upload_date': video_upload_date,
 280             'uploader_id': video_uploader_id,
 281             'thumbnail': video_thumbnail
 282         }]
 283
 284
 285
 286
 287
 288 def gen_extractors():
 289     """ Return a list of an instance of every supported extractor.
 290     The order does matter; the first extractor matched is the one handling the URL.
 291     """
 292     return [
 293         YoutubePlaylistIE(),
 294         YoutubeChannelIE(),
 295         YoutubeUserIE(),
 296         YoutubeSearchIE(),
 297         YoutubeIE(),
 298         MetacafeIE(),
 299         DailymotionIE(),
 300         GoogleSearchIE(),
 301         PhotobucketIE(),
 302         YahooIE(),
 303         YahooSearchIE(),
 304         DepositFilesIE(),
 305         FacebookIE(),
 306         BlipTVIE(),
 307         BlipTVUserIE(),
 308         VimeoIE(),
 309         MyVideoIE(),
 310         ComedyCentralIE(),
 311         EscapistIE(),
 312         CollegeHumorIE(),
 313         XVideosIE(),
 314         SoundcloudSetIE(),
 315         SoundcloudIE(),
 316         InfoQIE(),
 317         MixcloudIE(),
 318         StanfordOpenClassroomIE(),
 319         MTVIE(),
 320         YoukuIE(),
 321         XNXXIE(),
 322         YouJizzIE(),
 323         PornotubeIE(),
 324         YouPornIE(),
 325         GooglePlusIE(),
 326         ArteTvIE(),
 327         NBAIE(),
 328         WorldStarHipHopIE(),
 329         JustinTVIE(),
 330         FunnyOrDieIE(),
 331         SteamIE(),
 332         UstreamIE(),
 333         RBMARadioIE(),
 334         EightTracksIE(),
 335         KeekIE(),
 336         TEDIE(),
 337         MySpassIE(),
 338         SpiegelIE(),
 339         LiveLeakIE(),
 340         ARDIE(),
 341         ZDFIE(),
 342         TumblrIE(),
 343         BandcampIE(),
 344         RedTubeIE(),
 345         InaIE(),
 346         HowcastIE(),
 347         VineIE(),
 348         FlickrIE(),
 349         TeamcocoIE(),
 350         XHamsterIE(),
 351         HypemIE(),
 352         Vbox7IE(),
 353         GametrailersIE(),
 354         StatigramIE(),
 355         GenericIE()
 356     ]
 357
 358 def get_info_extractor(ie_name):
 359     """Returns the info extractor class with the given ie_name"""
 360     return globals()[ie_name+'IE']