_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bandcamp import BandcampIE
  24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  25 from .extractor.comedycentral import ComedyCentralIE
  26 from .extractor.collegehumor import CollegeHumorIE
  27 from .extractor.dailymotion import DailymotionIE
  28 from .extractor.depositfiles import DepositFilesIE
  29 from .extractor.eighttracks import EightTracksIE
  30 from .extractor.escapist import EscapistIE
  31 from .extractor.facebook import FacebookIE
  32 from .extractor.flickr import FlickrIE
  33 from .extractor.funnyordie import FunnyOrDieIE
  34 from .extractor.gametrailers import GametrailersIE
  35 from .extractor.generic import GenericIE
  36 from .extractor.googleplus import GooglePlusIE
  37 from .extractor.googlesearch import GoogleSearchIE
  38 from .extractor.howcast import HowcastIE
  39 from .extractor.hypem import HypemIE
  40 from .extractor.ina import InaIE
  41 from .extractor.infoq import InfoQIE
  42 from .extractor.justintv import JustinTVIE
  43 from .extractor.keek import KeekIE
  44 from .extractor.liveleak import LiveLeakIE
  45 from .extractor.metacafe import MetacafeIE
  46 from .extractor.mixcloud import MixcloudIE
  47 from .extractor.mtv import MTVIE
  48 from .extractor.myspass import MySpassIE
  49 from .extractor.myvideo import MyVideoIE
  50 from .extractor.nba import NBAIE
  51 from .extractor.statigram import StatigramIE
  52 from .extractor.photobucket import PhotobucketIE
  53 from .extractor.pornotube import PornotubeIE
  54 from .extractor.rbmaradio import RBMARadioIE
  55 from .extractor.redtube import RedTubeIE
  56 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  57 from .extractor.spiegel import SpiegelIE
  58 from .extractor.stanfordoc import StanfordOpenClassroomIE
  59 from .extractor.steam import SteamIE
  60 from .extractor.ted import TEDIE
  61 from .extractor.tumblr import TumblrIE
  62 from .extractor.ustream import UstreamIE
  63 from .extractor.vbox7 import Vbox7IE
  64 from .extractor.vimeo import VimeoIE
  65 from .extractor.vine import VineIE
  66 from .extractor.worldstarhiphop import WorldStarHipHopIE
  67 from .extractor.xnxx import XNXXIE
  68 from .extractor.xvideos import XVideosIE
  69 from .extractor.yahoo import YahooIE, YahooSearchIE
  70 from .extractor.youjizz import YouJizzIE
  71 from .extractor.youku import YoukuIE
  72 from .extractor.youporn import YouPornIE
  73 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  74 from .extractor.zdf import ZDFIE
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114 class TeamcocoIE(InfoExtractor):
 115     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 116
 117     def _real_extract(self, url):
 118         mobj = re.match(self._VALID_URL, url)
 119         if mobj is None:
 120             raise ExtractorError(u'Invalid URL: %s' % url)
 121         url_title = mobj.group('url_title')
 122         webpage = self._download_webpage(url, url_title)
 123
 124         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 125             webpage, u'video id')
 126
 127         self.report_extraction(video_id)
 128
 129         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 130             webpage, u'title')
 131
 132         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 133             webpage, u'thumbnail', fatal=False)
 134
 135         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 136             webpage, u'description', fatal=False)
 137
 138         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 139         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 140
 141         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 142             data, u'video URL')
 143
 144         return [{
 145             'id':          video_id,
 146             'url':         video_url,
 147             'ext':         'mp4',
 148             'title':       video_title,
 149             'thumbnail':   thumbnail,
 150             'description': video_description,
 151         }]
 152
 153 class XHamsterIE(InfoExtractor):
 154     """Information Extractor for xHamster"""
 155     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 156
 157     def _real_extract(self,url):
 158         mobj = re.match(self._VALID_URL, url)
 159
 160         video_id = mobj.group('id')
 161         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 162         webpage = self._download_webpage(mrss_url, video_id)
 163
 164         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 165         if mobj is None:
 166             raise ExtractorError(u'Unable to extract media URL')
 167         if len(mobj.group('server')) == 0:
 168             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 169         else:
 170             video_url = mobj.group('server')+'/key='+mobj.group('file')
 171         video_extension = video_url.split('.')[-1]
 172
 173         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 174             webpage, u'title')
 175
 176         # Can't see the description anywhere in the UI
 177         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 178         #     webpage, u'description', fatal=False)
 179         # if video_description: video_description = unescapeHTML(video_description)
 180
 181         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 182         if mobj:
 183             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 184         else:
 185             video_upload_date = None
 186             self._downloader.report_warning(u'Unable to extract upload date')
 187
 188         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 189             webpage, u'uploader id', default=u'anonymous')
 190
 191         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 192             webpage, u'thumbnail', fatal=False)
 193
 194         return [{
 195             'id':       video_id,
 196             'url':      video_url,
 197             'ext':      video_extension,
 198             'title':    video_title,
 199             # 'description': video_description,
 200             'upload_date': video_upload_date,
 201             'uploader_id': video_uploader_id,
 202             'thumbnail': video_thumbnail
 203         }]
 204
 205
 206
 207
 208
 209 def gen_extractors():
 210     """ Return a list of an instance of every supported extractor.
 211     The order does matter; the first extractor matched is the one handling the URL.
 212     """
 213     return [
 214         YoutubePlaylistIE(),
 215         YoutubeChannelIE(),
 216         YoutubeUserIE(),
 217         YoutubeSearchIE(),
 218         YoutubeIE(),
 219         MetacafeIE(),
 220         DailymotionIE(),
 221         GoogleSearchIE(),
 222         PhotobucketIE(),
 223         YahooIE(),
 224         YahooSearchIE(),
 225         DepositFilesIE(),
 226         FacebookIE(),
 227         BlipTVIE(),
 228         BlipTVUserIE(),
 229         VimeoIE(),
 230         MyVideoIE(),
 231         ComedyCentralIE(),
 232         EscapistIE(),
 233         CollegeHumorIE(),
 234         XVideosIE(),
 235         SoundcloudSetIE(),
 236         SoundcloudIE(),
 237         InfoQIE(),
 238         MixcloudIE(),
 239         StanfordOpenClassroomIE(),
 240         MTVIE(),
 241         YoukuIE(),
 242         XNXXIE(),
 243         YouJizzIE(),
 244         PornotubeIE(),
 245         YouPornIE(),
 246         GooglePlusIE(),
 247         ArteTvIE(),
 248         NBAIE(),
 249         WorldStarHipHopIE(),
 250         JustinTVIE(),
 251         FunnyOrDieIE(),
 252         SteamIE(),
 253         UstreamIE(),
 254         RBMARadioIE(),
 255         EightTracksIE(),
 256         KeekIE(),
 257         TEDIE(),
 258         MySpassIE(),
 259         SpiegelIE(),
 260         LiveLeakIE(),
 261         ARDIE(),
 262         ZDFIE(),
 263         TumblrIE(),
 264         BandcampIE(),
 265         RedTubeIE(),
 266         InaIE(),
 267         HowcastIE(),
 268         VineIE(),
 269         FlickrIE(),
 270         TeamcocoIE(),
 271         XHamsterIE(),
 272         HypemIE(),
 273         Vbox7IE(),
 274         GametrailersIE(),
 275         StatigramIE(),
 276         GenericIE()
 277     ]
 278
 279 def get_info_extractor(ie_name):
 280     """Returns the info extractor class with the given ie_name"""
 281     return globals()[ie_name+'IE']