-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from __future__ import absolute_import
-
import base64
import datetime
import itertools
import urllib
from .utils import *
-
-
from .extractor.common import InfoExtractor, SearchInfoExtractor
+
+from .extractor.ard import ARDIE
+from .extractor.arte import ArteTvIE
from .extractor.dailymotion import DailymotionIE
+from .extractor.gametrailers import GametrailersIE
+from .extractor.generic import GenericIE
from .extractor.metacafe import MetacafeIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.vimeo import VimeoIE
from .extractor.yahoo import YahooIE
-from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
-
-
-
-
+from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
+from .extractor.zdf import ZDFIE
-class ArteTvIE(InfoExtractor):
- """arte.tv information extractor."""
-
- _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
- _LIVE_URL = r'index-[0-9]+\.html$'
-
- IE_NAME = u'arte.tv'
-
- def fetch_webpage(self, url):
- request = compat_urllib_request.Request(url)
- try:
- self.report_download_webpage(url)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
- except ValueError as err:
- raise ExtractorError(u'Invalid URL: %s' % url)
- return webpage
-
- def grep_webpage(self, url, regex, regexFlags, matchTuples):
- page = self.fetch_webpage(url)
- mobj = re.search(regex, page, regexFlags)
- info = {}
-
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- for (i, key, err) in matchTuples:
- if mobj.group(i) is None:
- raise ExtractorError(err)
- else:
- info[key] = mobj.group(i)
-
- return info
- def extractLiveStream(self, url):
- video_lang = url.split('/')[-4]
- info = self.grep_webpage(
- url,
- r'src="(.*?/videothek_js.*?\.js)',
- 0,
- [
- (1, 'url', u'Invalid URL: %s' % url)
- ]
- )
- http_host = url.split('/')[2]
- next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
- info = self.grep_webpage(
- next_url,
- r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
- '(http://.*?\.swf).*?' +
- '(rtmp://.*?)\'',
- re.DOTALL,
- [
- (1, 'path', u'could not extract video path: %s' % url),
- (2, 'player', u'could not extract video player: %s' % url),
- (3, 'url', u'could not extract video url: %s' % url)
- ]
- )
- video_url = u'%s/%s' % (info.get('url'), info.get('path'))
-
- def extractPlus7Stream(self, url):
- video_lang = url.split('/')[-3]
- info = self.grep_webpage(
- url,
- r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
- 0,
- [
- (1, 'url', u'Invalid URL: %s' % url)
- ]
- )
- next_url = compat_urllib_parse.unquote(info.get('url'))
- info = self.grep_webpage(
- next_url,
- r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
- 0,
- [
- (1, 'url', u'Could not find <video> tag: %s' % url)
- ]
- )
- next_url = compat_urllib_parse.unquote(info.get('url'))
-
- info = self.grep_webpage(
- next_url,
- r'<video id="(.*?)".*?>.*?' +
- '<name>(.*?)</name>.*?' +
- '<dateVideo>(.*?)</dateVideo>.*?' +
- '<url quality="hd">(.*?)</url>',
- re.DOTALL,
- [
- (1, 'id', u'could not extract video id: %s' % url),
- (2, 'title', u'could not extract video title: %s' % url),
- (3, 'date', u'could not extract video date: %s' % url),
- (4, 'url', u'could not extract video url: %s' % url)
- ]
- )
- return {
- 'id': info.get('id'),
- 'url': compat_urllib_parse.unquote(info.get('url')),
- 'uploader': u'arte.tv',
- 'upload_date': unified_strdate(info.get('date')),
- 'title': info.get('title').decode('utf-8'),
- 'ext': u'mp4',
- 'format': u'NA',
- 'player_url': None,
- }
- def _real_extract(self, url):
- video_id = url.split('/')[-1]
- self.report_extraction(video_id)
- if re.search(self._LIVE_URL, video_id) is not None:
- self.extractLiveStream(url)
- return
- else:
- info = self.extractPlus7Stream(url)
- return [info]
-class GenericIE(InfoExtractor):
- """Generic last-resort information extractor."""
- _VALID_URL = r'.*'
- IE_NAME = u'generic'
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- if not self._downloader.params.get('test', False):
- self._downloader.report_warning(u'Falling back on generic information extractor.')
- super(GenericIE, self).report_download_webpage(video_id)
-
- def report_following_redirect(self, new_url):
- """Report information extraction."""
- self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-
- def _test_redirect(self, url):
- """Check if it is a redirect, like url shorteners, in case return the new url."""
- class HeadRequest(compat_urllib_request.Request):
- def get_method(self):
- return "HEAD"
-
- class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
- """
- Subclass the HTTPRedirectHandler to make it use our
- HeadRequest also on the redirected URL
- """
- def redirect_request(self, req, fp, code, msg, headers, newurl):
- if code in (301, 302, 303, 307):
- newurl = newurl.replace(' ', '%20')
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
- return HeadRequest(newurl,
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
- else:
- raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
- class HTTPMethodFallback(compat_urllib_request.BaseHandler):
- """
- Fallback to GET if HEAD is not allowed (405 HTTP error)
- """
- def http_error_405(self, req, fp, code, msg, headers):
- fp.read()
- fp.close()
-
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
- return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True))
-
- # Build our opener
- opener = compat_urllib_request.OpenerDirector()
- for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
- HTTPMethodFallback, HEADRedirectHandler,
- compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
- opener.add_handler(handler())
-
- response = opener.open(HeadRequest(url))
- if response is None:
- raise ExtractorError(u'Invalid URL protocol')
- new_url = response.geturl()
-
- if url == new_url:
- return False
-
- self.report_following_redirect(new_url)
- return new_url
-
- def _real_extract(self, url):
- new_url = self._test_redirect(url)
- if new_url: return [self.url_result(new_url)]
-
- video_id = url.split('/')[-1]
- try:
- webpage = self._download_webpage(url, video_id)
- except ValueError as err:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- self.report_extraction(video_id)
- # Start with something easy: JW Player in SWFObject
- mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit
- mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
- if mobj is None:
- # Try to find twitter cards info
- mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
- if mobj is None:
- # We look for Open Graph info:
- # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
- m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
- # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
- if m_video_type is not None:
- mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- # It's possible that one of the regexes
- # matched, but returned an empty group:
- if mobj.group(1) is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- video_url = compat_urllib_parse.unquote(mobj.group(1))
- video_id = os.path.basename(video_url)
-
- # here's a fun little line of code for you:
- video_extension = os.path.splitext(video_id)[1][1:]
- video_id = os.path.splitext(video_id)[0]
-
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title')
-
- # video uploader is domain name
- video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
- url, u'video uploader')
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'uploader': video_uploader,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_extension,
- }]
-
-
-class YoutubeSearchIE(SearchInfoExtractor):
- """Information Extractor for YouTube search queries."""
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
- _MAX_RESULTS = 1000
- IE_NAME = u'youtube:search'
- _SEARCH_KEY = 'ytsearch'
-
- def report_download_page(self, query, pagenum):
- """Report attempt to download search page with given number."""
- self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
-
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- video_ids = []
- pagenum = 0
- limit = n
-
- while (50 * pagenum) < limit:
- self.report_download_page(query, pagenum+1)
- result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
- request = compat_urllib_request.Request(result_url)
- try:
- data = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
- api_response = json.loads(data)['data']
-
- if not 'items' in api_response:
- raise ExtractorError(u'[youtube] No video results')
-
- new_ids = list(video['id'] for video in api_response['items'])
- video_ids += new_ids
-
- limit = min(n, api_response['totalItems'])
- pagenum += 1
-
- if len(video_ids) > n:
- video_ids = video_ids[:n]
- videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
- return self.playlist_result(videos, query)
-
-
-class GoogleSearchIE(SearchInfoExtractor):
- """Information Extractor for Google Video search queries."""
- _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
- _MAX_RESULTS = 1000
- IE_NAME = u'video.google:search'
- _SEARCH_KEY = 'gvsearch'
-
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
-
- for pagenum in itertools.count(1):
- result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
- webpage = self._download_webpage(result_url, u'gvsearch:' + query,
- note='Downloading result page ' + str(pagenum))
-
- for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
- e = {
- '_type': 'url',
- 'url': mobj.group(1)
- }
- res['entries'].append(e)
-
- if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
- return res
class YahooSearchIE(SearchInfoExtractor):
"""Information Extractor for Yahoo! Video search queries."""
return [info]
-class ARDIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
- _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
- _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
-
- def _real_extract(self, url):
- # determine video id from url
- m = re.match(self._VALID_URL, url)
-
- numid = re.search(r'documentId=([0-9]+)', url)
- if numid:
- video_id = numid.group(1)
- else:
- video_id = m.group('video_id')
-
- # determine title and media streams from webpage
- html = self._download_webpage(url, video_id)
- title = re.search(self._TITLE, html).group('title')
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
- if not streams:
- assert '"fsk"' in html
- raise ExtractorError(u'This video is only available after 8:00 pm')
-
- # choose default media type and highest quality for now
- stream = max([s for s in streams if int(s["media_type"]) == 0],
- key=lambda s: int(s["quality"]))
-
- # there's two possibilities: RTMP stream or HTTP download
- info = {'id': video_id, 'title': title, 'ext': 'mp4'}
- if stream['rtmp_url']:
- self.to_screen(u'RTMP download detected')
- assert stream['video_url'].startswith('mp4:')
- info["url"] = stream["rtmp_url"]
- info["play_path"] = stream['video_url']
- else:
- assert stream["video_url"].endswith('.mp4')
- info["url"] = stream["video_url"]
- return [info]
-
-class ZDFIE(InfoExtractor):
- _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
- _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
- _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
- _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- video_id = mobj.group('video_id')
-
- html = self._download_webpage(url, video_id)
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
- if streams is None:
- raise ExtractorError(u'No media url found.')
-
- # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
- # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
- # choose first/default media type and highest quality for now
- for s in streams: #find 300 - dsl1000mbit
- if s['quality'] == '300' and s['media_type'] == 'wstreaming':
- stream_=s
- break
- for s in streams: #find veryhigh - dsl2000mbit
- if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
- stream_=s
- break
- if stream_ is None:
- raise ExtractorError(u'No stream found.')
-
- media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
- self.report_extraction(video_id)
- mobj = re.search(self._TITLE, html)
- if mobj is None:
- raise ExtractorError(u'Cannot extract title')
- title = unescapeHTML(mobj.group('title'))
-
- mobj = re.search(self._MMS_STREAM, media_link)
- if mobj is None:
- mobj = re.search(self._RTSP_STREAM, media_link)
- if mobj is None:
- raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
- mms_url = mobj.group('video_url')
-
- mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
- if mobj is None:
- raise ExtractorError(u'Cannot extract extention')
- ext = mobj.group('ext')
-
- return [{'id': video_id,
- 'url': mms_url,
- 'title': title,
- 'ext': ext
- }]
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
'thumbnail': thumbnail_url,
}]
-class GametrailersIE(InfoExtractor):
- _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- video_id = mobj.group('id')
- video_type = mobj.group('type')
- webpage = self._download_webpage(url, video_id)
- if video_type == 'full-episodes':
- mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
- else:
- mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
- mgid = self._search_regex(mgid_re, webpage, u'mgid')
- data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
-
- info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
- video_id, u'Downloading video info')
- links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
- video_id, u'Downloading video urls info')
-
- self.report_extraction(video_id)
- info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
- <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
- <image>.*
- <url>(?P<thumb>.*?)</url>.*
- </image>'''
-
- m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
- if m_info is None:
- raise ExtractorError(u'Unable to extract video info')
- video_title = m_info.group('title')
- video_description = m_info.group('description')
- video_thumb = m_info.group('thumb')
-
- m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
- if m_urls is None or len(m_urls) == 0:
- raise ExtractError(u'Unable to extrat video url')
- # They are sorted from worst to best quality
- video_url = m_urls[-1].group('url')
-
- return {'url': video_url,
- 'id': video_id,
- 'title': video_title,
- # Videos are actually flv not mp4
- 'ext': 'flv',
- 'thumbnail': video_thumb,
- 'description': video_description,
- }
def gen_extractors():
""" Return a list of an instance of every supported extractor.