X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=inline;f=youtube_dl%2FInfoExtractors.py;h=672ef9eedb40b1f8aa7db86e0a20b591c88511f3;hb=e3b7aa8428ba96cd21cfa9824ce8c06df55bfc08;hp=999521feb28bc54f301e9cf75424df98d2649453;hpb=9f4e6bbaeb50fd27f90c799ed8d2531532cfdad7;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 999521feb..672ef9eed 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1,2987 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +# Legacy file for backwards compatibility, use youtube_dl.extractor instead! -from __future__ import absolute_import - -import base64 -import datetime -import itertools -import netrc -import os -import re -import socket -import time -import email.utils -import xml.etree.ElementTree -import random -import math -import operator -import hashlib -import binascii -import urllib - -from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor - -from .extractor.ard import ARDIE -from .extractor.arte import ArteTvIE -from .extractor.dailymotion import DailymotionIE -from .extractor.gametrailers import GametrailersIE -from .extractor.metacafe import MetacafeIE -from .extractor.statigram import StatigramIE -from .extractor.photobucket import PhotobucketIE -from .extractor.vimeo import VimeoIE -from .extractor.yahoo import YahooIE -from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE -from .extractor.zdf import ZDFIE - - - - - - - - - - - -class GenericIE(InfoExtractor): - """Generic last-resort information extractor.""" - - _VALID_URL = r'.*' - IE_NAME = u'generic' - - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning(u'Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - - def report_following_redirect(self, new_url): - """Report information extraction.""" - self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - - def _test_redirect(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - class HeadRequest(compat_urllib_request.Request): - def get_method(self): - return "HEAD" - - class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HeadRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HeadRequest(url)) - if response is None: - raise ExtractorError(u'Invalid URL protocol') - new_url = response.geturl() - - if url == new_url: - return False - - self.report_following_redirect(new_url) - return new_url - - def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] - - video_id = url.split('/')[-1] - try: - webpage = self._download_webpage(url, video_id) - except ValueError as err: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError(u'Invalid URL: %s' % url) - - self.report_extraction(video_id) - # Start with something easy: JW Player in SWFObject - mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit - mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) - if mobj is None: - # Try to find twitter cards info - mobj = re.search(r'(.*)', - webpage, u'video title') - - # video uploader is domain name - video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', - url, u'video uploader') - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension, - }] - - - -class GoogleSearchIE(SearchInfoExtractor): - """Information Extractor for Google Video search queries.""" - _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"' - _MAX_RESULTS = 1000 - IE_NAME = u'video.google:search' - _SEARCH_KEY = 'gvsearch' - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - res = { - '_type': 'playlist', - 'id': query, - 'entries': [] - } - - for pagenum in itertools.count(1): - result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) - webpage = self._download_webpage(result_url, u'gvsearch:' + query, - note='Downloading result page ' + str(pagenum)) - - for mobj in re.finditer(r'