Remove useless headers
[youtube-dl] / youtube_dl / InfoExtractors.py
index acf11a96075ff875484c163eb20689e7ae32d535..26fc656905273ae37d4f68d4f85769b3a9bb3434 100755 (executable)
@@ -1,8 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from __future__ import absolute_import
-
 import base64
 import datetime
 import itertools
@@ -26,6 +21,8 @@ from .extractor.common import InfoExtractor, SearchInfoExtractor
 from .extractor.ard import ARDIE
 from .extractor.arte import ArteTvIE
 from .extractor.dailymotion import DailymotionIE
+from .extractor.gametrailers import GametrailersIE
+from .extractor.generic import GenericIE
 from .extractor.metacafe import MetacafeIE
 from .extractor.statigram import StatigramIE
 from .extractor.photobucket import PhotobucketIE
@@ -44,178 +41,9 @@ from .extractor.zdf import ZDFIE
 
 
 
-class GenericIE(InfoExtractor):
-    """Generic last-resort information extractor."""
-
-    _VALID_URL = r'.*'
-    IE_NAME = u'generic'
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        if not self._downloader.params.get('test', False):
-            self._downloader.report_warning(u'Falling back on generic information extractor.')
-        super(GenericIE, self).report_download_webpage(video_id)
-
-    def report_following_redirect(self, new_url):
-        """Report information extraction."""
-        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-
-    def _test_redirect(self, url):
-        """Check if it is a redirect, like url shorteners, in case return the new url."""
-        class HeadRequest(compat_urllib_request.Request):
-            def get_method(self):
-                return "HEAD"
-
-        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
-            """
-            Subclass the HTTPRedirectHandler to make it use our
-            HeadRequest also on the redirected URL
-            """
-            def redirect_request(self, req, fp, code, msg, headers, newurl):
-                if code in (301, 302, 303, 307):
-                    newurl = newurl.replace(' ', '%20')
-                    newheaders = dict((k,v) for k,v in req.headers.items()
-                                      if k.lower() not in ("content-length", "content-type"))
-                    return HeadRequest(newurl,
-                                       headers=newheaders,
-                                       origin_req_host=req.get_origin_req_host(),
-                                       unverifiable=True)
-                else:
-                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
-        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
-            """
-            Fallback to GET if HEAD is not allowed (405 HTTP error)
-            """
-            def http_error_405(self, req, fp, code, msg, headers):
-                fp.read()
-                fp.close()
-
-                newheaders = dict((k,v) for k,v in req.headers.items()
-                                  if k.lower() not in ("content-length", "content-type"))
-                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
-                                                 headers=newheaders,
-                                                 origin_req_host=req.get_origin_req_host(),
-                                                 unverifiable=True))
-
-        # Build our opener
-        opener = compat_urllib_request.OpenerDirector()
-        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
-                        HTTPMethodFallback, HEADRedirectHandler,
-                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
-            opener.add_handler(handler())
-
-        response = opener.open(HeadRequest(url))
-        if response is None:
-            raise ExtractorError(u'Invalid URL protocol')
-        new_url = response.geturl()
-
-        if url == new_url:
-            return False
-
-        self.report_following_redirect(new_url)
-        return new_url
-
-    def _real_extract(self, url):
-        new_url = self._test_redirect(url)
-        if new_url: return [self.url_result(new_url)]
-
-        video_id = url.split('/')[-1]
-        try:
-            webpage = self._download_webpage(url, video_id)
-        except ValueError as err:
-            # since this is the last-resort InfoExtractor, if
-            # this error is thrown, it'll be thrown here
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        self.report_extraction(video_id)
-        # Start with something easy: JW Player in SWFObject
-        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit
-            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Try to find twitter cards info
-            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
-        if mobj is None:
-            # We look for Open Graph info:
-            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
-            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
-            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
-            if m_video_type is not None:
-                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        # It's possible that one of the regexes
-        # matched, but returned an empty group:
-        if mobj.group(1) is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        video_url = compat_urllib_parse.unquote(mobj.group(1))
-        video_id = os.path.basename(video_url)
-
-        # here's a fun little line of code for you:
-        video_extension = os.path.splitext(video_id)[1][1:]
-        video_id = os.path.splitext(video_id)[0]
-
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title')
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
-            url, u'video uploader')
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'uploader': video_uploader,
-            'upload_date':  None,
-            'title':    video_title,
-            'ext':      video_extension,
-        }]
-
-
-
-class GoogleSearchIE(SearchInfoExtractor):
-    """Information Extractor for Google Video search queries."""
-    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
-    _MAX_RESULTS = 1000
-    IE_NAME = u'video.google:search'
-    _SEARCH_KEY = 'gvsearch'
-
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'entries': []
-        }
 
-        for pagenum in itertools.count(1):
-            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
-            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
-                                             note='Downloading result page ' + str(pagenum))
 
-            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
-                e = {
-                    '_type': 'url',
-                    'url': mobj.group(1)
-                }
-                res['entries'].append(e)
 
-            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
-                return res
 
 class YahooSearchIE(SearchInfoExtractor):
     """Information Extractor for Yahoo! Video search queries."""
@@ -2910,56 +2738,6 @@ class Vbox7IE(InfoExtractor):
             'thumbnail': thumbnail_url,
         }]
 
-class GametrailersIE(InfoExtractor):
-    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group('id')
-        video_type = mobj.group('type')
-        webpage = self._download_webpage(url, video_id)
-        if video_type == 'full-episodes':
-            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
-        else:
-            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
-        mgid = self._search_regex(mgid_re, webpage, u'mgid')
-        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
-
-        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
-                                           video_id, u'Downloading video info')
-        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
-                                               video_id, u'Downloading video urls info')
-
-        self.report_extraction(video_id)
-        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
-                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
-                      <image>.*
-                        <url>(?P<thumb>.*?)</url>.*
-                      </image>'''
-
-        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
-        if m_info is None:
-            raise ExtractorError(u'Unable to extract video info')
-        video_title = m_info.group('title')
-        video_description = m_info.group('description')
-        video_thumb = m_info.group('thumb')
-
-        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
-        if m_urls is None or len(m_urls) == 0:
-            raise ExtractError(u'Unable to extrat video url')
-        # They are sorted from worst to best quality
-        video_url = m_urls[-1].group('url')
-
-        return {'url':         video_url,
-                'id':          video_id,
-                'title':       video_title,
-                # Videos are actually flv not mp4
-                'ext':         'flv',
-                'thumbnail':   video_thumb,
-                'description': video_description,
-                }
 
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.