X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f64b88d55c187c7c9dfa9ef874136e3fbb98387c;hb=a130bc6d024e9bfa3c7f8742f8bf5038b2c6e363;hp=da2294a6b89b8e93bb1511babbdc282f2b46969e;hpb=b5809a68bf94e5aa1172e5415be8d7fb91877de8;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index da2294a6b..f64b88d55 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -2,26 +2,17 @@ # -*- coding: utf-8 -*- import datetime -import HTMLParser -import httplib import netrc import os import re import socket import time -import urllib -import urllib2 import email.utils import xml.etree.ElementTree import random import math from urlparse import parse_qs -try: - import cStringIO as StringIO -except ImportError: - import StringIO - from utils import * @@ -29,37 +20,48 @@ class InfoExtractor(object): """Information Extractor class. Information extractors are the classes that, given a URL, extract - information from the video (or videos) the URL refers to. This - information includes the real video URL, the video title and simplified - title, author and others. The information is stored in a dictionary - which is then passed to the FileDownloader. The FileDownloader - processes this information possibly downloading the video to the file - system, among other possible outcomes. The dictionaries must include - the following fields: - - id: Video identifier. - url: Final video URL. - uploader: Nickname of the video uploader. - title: Literal title. - ext: Video filename extension. - format: Video format. - player_url: SWF Player URL (may be None). - - The following fields are optional. Their primary purpose is to allow - youtube-dl to serve as the backend for a video search function, such - as the one in youtube2mp3. They are only used when their respective - forced printing functions are called: - - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the FileDownloader. The FileDownloader processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The dictionaries must include the following fields: + + id: Video identifier. + url: Final video URL. + uploader: Nickname of the video uploader, unescaped. + upload_date: Video upload date (YYYYMMDD). + title: Video title, unescaped. + ext: Video filename extension. + + The following fields are optional: + + format: The video format, defaults to ext (used for --get-format) + thumbnail: Full URL to a video thumbnail image. + description: One-line video description. + player_url: SWF Player URL (used for rtmpdump). + subtitles: The .srt file contents. + urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib.request.urlopen + + The fields should all be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + + _real_extract() must return a *list* of information dictionaries as + described above. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. """ _ready = False _downloader = None + _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" @@ -70,6 +72,10 @@ class InfoExtractor(object): """Receives a URL and returns True if suitable for this IE.""" return re.match(self._VALID_URL, url) is not None + def working(self): + """Getter method for _WORKING.""" + return self._WORKING + def initialize(self): """Initializes an instance (authentication, etc).""" if not self._ready: @@ -97,7 +103,26 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r"""^ + ( + (?:https?://)? # http(s):// (optional) + (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx) + v= + ) + )? # optional -> youtube.com/xxxx is OK + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' @@ -136,6 +161,10 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' + def suitable(self, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(self._VALID_URL, url, re.VERBOSE) is not None + def report_lang(self): """Report attempt to set language.""" self._downloader.to_screen(u'[youtube] Setting language') @@ -190,9 +219,9 @@ class YoutubeIE(InfoExtractor): return srt def _print_formats(self, formats): - print 'Available formats:' + print('Available formats:') for x in formats: - print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) def _real_initialize(self): if self._downloader is None: @@ -214,17 +243,17 @@ class YoutubeIE(InfoExtractor): password = info[2] else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError), err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) return # Set language - request = urllib2.Request(self._LANG_URL) + request = compat_urllib_request.Request(self._LANG_URL) try: self.report_lang() - urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -239,15 +268,15 @@ class YoutubeIE(InfoExtractor): 'username': username, 'password': password, } - request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) + request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) try: self.report_login() - login_results = urllib2.urlopen(request).read() + login_results = compat_urllib_request.urlopen(request).read() if re.search(r'(?i)]* name="loginForm"', login_results) is not None: self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') return - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) return # Confirm age @@ -255,22 +284,22 @@ class YoutubeIE(InfoExtractor): 'next_url': '/', 'action_confirm': 'Confirm', } - request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form)) + request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) try: self.report_age_confirmation() - age_results = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + age_results = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: - url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') + url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') # Extract video id from URL - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return @@ -278,11 +307,11 @@ class YoutubeIE(InfoExtractor): # Get video webpage self.report_video_webpage_download(video_id) - request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) + request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) try: - video_webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + video_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) return # Attempt to extract SWF player URL @@ -297,14 +326,14 @@ class YoutubeIE(InfoExtractor): for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (video_id, el_type)) - request = urllib2.Request(video_info_url) + request = compat_urllib_request.Request(video_info_url) try: - video_info_webpage = urllib2.urlopen(request).read() + video_info_webpage = compat_urllib_request.urlopen(request).read() video_info = parse_qs(video_info_webpage) if 'token' in video_info: break - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) return if 'token' not in video_info: if 'reason' in video_info: @@ -325,13 +354,13 @@ class YoutubeIE(InfoExtractor): if 'author' not in video_info: self._downloader.trouble(u'ERROR: unable to extract uploader nickname') return - video_uploader = urllib.unquote_plus(video_info['author'][0]) + video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) # title if 'title' not in video_info: self._downloader.trouble(u'ERROR: unable to extract video title') return - video_title = urllib.unquote_plus(video_info['title'][0]) + video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) video_title = video_title.decode('utf-8') # thumbnail image @@ -339,10 +368,10 @@ class YoutubeIE(InfoExtractor): self._downloader.trouble(u'WARNING: unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it - video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) + video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) # upload date - upload_date = u'NA' + upload_date = None mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) @@ -363,11 +392,11 @@ class YoutubeIE(InfoExtractor): if self._downloader.params.get('writesubtitles', False): try: self.report_video_subtitles_download(video_id) - request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + srt_list = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) if not srt_lang_list: @@ -380,19 +409,25 @@ class YoutubeIE(InfoExtractor): srt_lang = srt_lang_list.keys()[0] if not srt_lang in srt_lang_list: raise Trouble(u'WARNING: no closed captions found in the specified language') - request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) + request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) try: - srt_xml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + srt_xml = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) if not srt_xml: raise Trouble(u'WARNING: unable to download video subtitles') video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) except Trouble as trouble: self._downloader.trouble(trouble[0]) + if 'length_seconds' not in video_info: + self._downloader.trouble(u'WARNING: unable to extract video duration') + video_duration = '' + else: + video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + # token - video_token = urllib.unquote_plus(video_info['token'][0]) + video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) # Decide which formats to download req_format = self._downloader.params.get('format', None) @@ -404,7 +439,7 @@ class YoutubeIE(InfoExtractor): url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') url_data = [parse_qs(uds) for uds in url_data_strs] url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) - url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) + url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats @@ -446,6 +481,9 @@ class YoutubeIE(InfoExtractor): # Extension video_extension = self._video_extensions.get(format_param, 'flv') + video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'), + self._video_dimensions.get(format_param, '???')) + results.append({ 'id': video_id.decode('utf-8'), 'url': video_real_url.decode('utf-8'), @@ -453,11 +491,12 @@ class YoutubeIE(InfoExtractor): 'upload_date': upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'format': video_format, 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, - 'subtitles': video_subtitles + 'subtitles': video_subtitles, + 'duration': video_duration }) return results @@ -491,12 +530,12 @@ class MetacafeIE(InfoExtractor): def _real_initialize(self): # Retrieve disclaimer - request = urllib2.Request(self._DISCLAIMER) + request = compat_urllib_request.Request(self._DISCLAIMER) try: self.report_disclaimer() - disclaimer = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err)) + disclaimer = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) return # Confirm age @@ -504,12 +543,12 @@ class MetacafeIE(InfoExtractor): 'filters': '0', 'submit': "Continue - I'm over 18", } - request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form)) + request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) try: self.report_age_confirmation() - disclaimer = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + disclaimer = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): @@ -528,19 +567,19 @@ class MetacafeIE(InfoExtractor): return # Retrieve video webpage to extract further information - request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) + request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: - mediaURL = urllib.unquote(mobj.group(1)) + mediaURL = compat_urllib_parse.unquote(mobj.group(1)) video_extension = mediaURL[-3:] # Extract gdaKey if available @@ -573,7 +612,7 @@ class MetacafeIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') - mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) + mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract uploader nickname') return @@ -583,18 +622,16 @@ class MetacafeIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' def __init__(self, downloader=None): @@ -615,36 +652,45 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group(1).split('_')[0].split('?')[0] - video_extension = 'flv' + video_extension = 'mp4' # Retrieve video webpage to extract further information - request = urllib2.Request(url) + request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) + mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - sequence = urllib.unquote(mobj.group(1)) - mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + flashvars = compat_urllib_parse.unquote(mobj.group(1)) + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self._downloader.to_screen(u'[dailymotion] Using %s' % key) + break + else: + self._downloader.trouble(u'ERROR: unable to extract video URL') + return + + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.trouble(u'ERROR: unable to extract video URL') return - mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') - # if needed add http://www.dailymotion.com/ if relative URL + video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - video_url = mediaURL + # TODO: support choosing qualities mobj = re.search(r'', webpage) if mobj is None: @@ -652,21 +698,30 @@ class DailymotionIE(InfoExtractor): return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) + video_uploader = None + mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) + # lookin for official user + mobj_official = re.search(r'', webpage) + if mobj_official is None: + self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + else: + video_uploader = mobj_official.group(1) + else: + video_uploader = mobj.group(1) + + video_upload_date = None + mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -699,12 +754,12 @@ class GoogleIE(InfoExtractor): video_extension = 'mp4' # Retrieve video webpage to extract further information - request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id) + request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id) try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -716,7 +771,7 @@ class GoogleIE(InfoExtractor): if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - mediaURL = urllib.unquote(mobj.group(1)) + mediaURL = compat_urllib_parse.unquote(mobj.group(1)) mediaURL = mediaURL.replace('\\x3d', '\x3d') mediaURL = mediaURL.replace('\\x26', '\x26') @@ -739,11 +794,11 @@ class GoogleIE(InfoExtractor): # Extract video thumbnail if self._downloader.params.get('forcethumbnail', False): - request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id))) + request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id))) try: - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'', webpage) if mobj is None: @@ -756,12 +811,10 @@ class GoogleIE(InfoExtractor): return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), - 'uploader': u'NA', - 'upload_date': u'NA', + 'uploader': None, + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -794,12 +847,12 @@ class PhotobucketIE(InfoExtractor): video_extension = 'flv' # Retrieve video webpage to extract further information - request = urllib2.Request(url) + request = compat_urllib_request.Request(url) try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -808,7 +861,7 @@ class PhotobucketIE(InfoExtractor): if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - mediaURL = urllib.unquote(mobj.group(1)) + mediaURL = compat_urllib_parse.unquote(mobj.group(1)) video_url = mediaURL @@ -824,11 +877,9 @@ class PhotobucketIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -865,11 +916,11 @@ class YahooIE(InfoExtractor): # Rewrite valid but non-extractable URLs as # extractable English language /watch/ URLs if re.match(self._VPAGE_URL, url) is None: - request = urllib2.Request(url) + request = compat_urllib_request.Request(url) try: - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) @@ -888,12 +939,12 @@ class YahooIE(InfoExtractor): return self._real_extract(url, new_video=False) # Retrieve video webpage to extract further information - request = urllib2.Request(url) + request = compat_urllib_request.Request(url) try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract uploader and title from webpage @@ -944,14 +995,14 @@ class YahooIE(InfoExtractor): # seem to need most of them, otherwise the server sends a 401. yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents yv_bitrate = '700' # according to Wikipedia this is hard-coded - request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + + request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract media URL from playlist XML @@ -959,20 +1010,18 @@ class YahooIE(InfoExtractor): if mobj is None: self._downloader.trouble(u'ERROR: Unable to extract media URL') return - video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') + video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') video_url = unescapeHTML(video_url) return [{ 'id': video_id.decode('utf-8'), 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, - 'thumbnail': video_thumbnail, - 'player_url': None, }] @@ -980,7 +1029,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' + _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)' IE_NAME = u'vimeo' def __init__(self, downloader=None): @@ -1004,12 +1053,12 @@ class VimeoIE(InfoExtractor): video_id = mobj.group(1) # Retrieve video webpage to extract further information - request = urllib2.Request(url, None, std_headers) + request = compat_urllib_request.Request(url, None, std_headers) try: self.report_download_webpage(video_id) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Now we begin extracting as much information as we can from what we @@ -1040,7 +1089,7 @@ class VimeoIE(InfoExtractor): else: video_description = '' # Extract upload date - video_upload_date = u'NA' + video_upload_date = None mobj = re.search(r'', webpage) if mobj is not None: video_upload_date = mobj.group(1) @@ -1050,21 +1099,32 @@ class VimeoIE(InfoExtractor): timestamp = config['request']['timestamp'] # Vimeo specific: extract video codec and quality information + # First consider quality, then codecs, then take everything # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] - for codec in codecs: - if codec[0] in config["video"]["files"]: - video_codec = codec[0] - video_extension = codec[1] - if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' - else: quality = 'sd' + files = { 'hd': [], 'sd': [], 'other': []} + for codec_name, codec_extension in codecs: + if codec_name in config["video"]["files"]: + if 'hd' in config["video"]["files"][codec_name]: + files['hd'].append((codec_name, codec_extension, 'hd')) + elif 'sd' in config["video"]["files"][codec_name]: + files['sd'].append((codec_name, codec_extension, 'sd')) + else: + files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + + for quality in ('hd', 'sd', 'other'): + if len(files[quality]) > 0: + video_quality = files[quality][0][2] + video_codec = files[quality][0][0] + video_extension = files[quality][0][1] + self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) break else: self._downloader.trouble(u'ERROR: no known codec found') return video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, quality, video_codec.upper()) + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, @@ -1075,10 +1135,146 @@ class VimeoIE(InfoExtractor): 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, - 'player_url': None, }] +class ArteTvIE(InfoExtractor): + """arte.tv information extractor.""" + + _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' + _LIVE_URL = r'index-[0-9]+\.html$' + + IE_NAME = u'arte.tv' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) + + def fetch_webpage(self, url): + self._downloader.increment_downloads() + request = compat_urllib_request.Request(url) + try: + self.report_download_webpage(url) + webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + return + except ValueError as err: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + return webpage + + def grep_webpage(self, url, regex, regexFlags, matchTuples): + page = self.fetch_webpage(url) + mobj = re.search(regex, page, regexFlags) + info = {} + + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + for (i, key, err) in matchTuples: + if mobj.group(i) is None: + self._downloader.trouble(err) + return + else: + info[key] = mobj.group(i) + + return info + + def extractLiveStream(self, url): + video_lang = url.split('/')[-4] + info = self.grep_webpage( + url, + r'src="(.*?/videothek_js.*?\.js)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + http_host = url.split('/')[2] + next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) + info = self.grep_webpage( + next_url, + r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + + '(http://.*?\.swf).*?' + + '(rtmp://.*?)\'', + re.DOTALL, + [ + (1, 'path', u'ERROR: could not extract video path: %s' % url), + (2, 'player', u'ERROR: could not extract video player: %s' % url), + (3, 'url', u'ERROR: could not extract video url: %s' % url) + ] + ) + video_url = u'%s/%s' % (info.get('url'), info.get('path')) + + def extractPlus7Stream(self, url): + video_lang = url.split('/')[-3] + info = self.grep_webpage( + url, + r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + next_url = compat_urllib_parse.unquote(info.get('url')) + info = self.grep_webpage( + next_url, + r'