X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=cea30dad81fa4224a848732159aa19684c7d5dbc;hb=33d94a6c999ae784be7529aaaea42adadeab0c27;hp=4314f14022b650cb965707ee5c90e07fbecd67c4;hpb=d11d05d07acdd11a93b02d750852dea4ae32be3b;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4314f1402..cea30dad8 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -12,28 +12,16 @@ import time import urllib import urllib2 import email.utils +import xml.etree.ElementTree +import random +import math +from urlparse import parse_qs try: import cStringIO as StringIO except ImportError: import StringIO -# parse_qs was moved from the cgi module to the urlparse module recently. -try: - from urlparse import parse_qs -except ImportError: - from cgi import parse_qs - -try: - import lxml.etree -except ImportError: - pass # Handled below - -try: - import xml.etree.ElementTree -except ImportError: # Python<2.5: Not officially supported, but let it slip - warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') - from utils import * @@ -53,7 +41,6 @@ class InfoExtractor(object): url: Final video URL. uploader: Nickname of the video uploader. title: Literal title. - stitle: Simplified title. ext: Video filename extension. format: Video format. player_url: SWF Player URL (may be None). @@ -110,15 +97,34 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r"""^ + ( + (?:https?://)? # http(s):// (optional) + (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx) + v= + ) + )? # optional -> youtube.com/xxxx is OK + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -129,6 +135,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '44': 'webm', '45': 'webm', + '46': 'webm', } _video_dimensions = { '5': '240x400', @@ -144,9 +151,14 @@ class YoutubeIE(InfoExtractor): '43': '360x640', '44': '480x854', '45': '720x1280', + '46': '1080x1920', } IE_NAME = u'youtube' + def suitable(self, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(self._VALID_URL, url, re.VERBOSE) is not None + def report_lang(self): """Report attempt to set language.""" self._downloader.to_screen(u'[youtube] Setting language') @@ -193,17 +205,17 @@ class YoutubeIE(InfoExtractor): end = start + float(dur) start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional - srt += str(n) + '\n' + caption = unescapeHTML(caption) + caption = unescapeHTML(caption) # double cycle, intentional + srt += str(n+1) + '\n' srt += start + ' --> ' + end + '\n' srt += caption + '\n\n' return srt def _print_formats(self, formats): - print 'Available formats:' + print('Available formats:') for x in formats: - print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) def _real_initialize(self): if self._downloader is None: @@ -226,7 +238,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError), err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) return # Set language @@ -235,7 +247,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -258,7 +270,7 @@ class YoutubeIE(InfoExtractor): self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') return except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) return # Confirm age @@ -271,7 +283,7 @@ class YoutubeIE(InfoExtractor): self.report_age_confirmation() age_results = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): @@ -281,7 +293,7 @@ class YoutubeIE(InfoExtractor): url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') # Extract video id from URL - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return @@ -293,7 +305,7 @@ class YoutubeIE(InfoExtractor): try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) return # Attempt to extract SWF player URL @@ -315,7 +327,7 @@ class YoutubeIE(InfoExtractor): if 'token' in video_info: break except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) return if 'token' not in video_info: if 'reason' in video_info: @@ -324,6 +336,11 @@ class YoutubeIE(InfoExtractor): self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') return + # Check for "rental" videos + if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: + self._downloader.trouble(u'ERROR: "rental" videos not supported') + return + # Start extracting information self.report_information_extraction(video_id) @@ -339,10 +356,6 @@ class YoutubeIE(InfoExtractor): return video_title = urllib.unquote_plus(video_info['title'][0]) video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - # simplified title - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -364,49 +377,48 @@ class YoutubeIE(InfoExtractor): pass # description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') - else: - html_parser = lxml.etree.HTMLParser(encoding='utf-8') - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) - # TODO use another parser + video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # closed captions video_subtitles = None if self._downloader.params.get('writesubtitles', False): - self.report_video_subtitles_download(video_id) - request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: - srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) - if srt_lang_list: - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = srt_lang_list[0] - if not srt_lang in srt_lang_list: - self._downloader.trouble(u'WARNING: no closed captions found in the specified language') - else: - request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) - try: - srt_xml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: - video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) + srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) + srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) + if not srt_lang_list: + raise Trouble(u'WARNING: video has no closed captions') + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' else: - self._downloader.trouble(u'WARNING: video has no closed captions') + srt_lang = srt_lang_list.keys()[0] + if not srt_lang in srt_lang_list: + raise Trouble(u'WARNING: no closed captions found in the specified language') + request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) + if not srt_xml: + raise Trouble(u'WARNING: unable to download video subtitles') + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + except Trouble as trouble: + self._downloader.trouble(trouble[0]) + + if 'length_seconds' not in video_info: + self._downloader.trouble(u'WARNING: unable to extract video duration') + video_duration = '' + else: + video_duration = urllib.unquote_plus(video_info['length_seconds'][0]) # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -421,7 +433,7 @@ class YoutubeIE(InfoExtractor): url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') url_data = [parse_qs(uds) for uds in url_data_strs] url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) - url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) + url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats @@ -469,13 +481,13 @@ class YoutubeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, - 'subtitles': video_subtitles + 'subtitles': video_subtitles, + 'duration': video_duration }) return results @@ -514,7 +526,7 @@ class MetacafeIE(InfoExtractor): self.report_disclaimer() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) return # Confirm age @@ -527,7 +539,7 @@ class MetacafeIE(InfoExtractor): self.report_age_confirmation() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): @@ -545,15 +557,13 @@ class MetacafeIE(InfoExtractor): self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) return - simple_title = mobj.group(2).decode('utf-8') - # Retrieve video webpage to extract further information request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage @@ -592,9 +602,8 @@ class MetacafeIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) + mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract uploader nickname') return @@ -606,7 +615,6 @@ class MetacafeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -616,7 +624,7 @@ class MetacafeIE(InfoExtractor): class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' def __init__(self, downloader=None): @@ -637,9 +645,9 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group(1).split('_')[0].split('?')[0] - video_extension = 'flv' + video_extension = 'mp4' # Retrieve video webpage to extract further information request = urllib2.Request(url) @@ -648,47 +656,64 @@ class DailymotionIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) + mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - sequence = urllib.unquote(mobj.group(1)) - mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + flashvars = urllib.unquote(mobj.group(1)) + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self._downloader.to_screen(u'[dailymotion] Using %s' % key) + break + else: + self._downloader.trouble(u'ERROR: unable to extract video URL') + return + + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.trouble(u'ERROR: unable to extract video URL') return - mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') - # if needed add http://www.dailymotion.com/ if relative URL + video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/') - video_url = mediaURL + # TODO: support choosing qualities mobj = re.search(r'', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract title') return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) + video_uploader = u'NA' + mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) + # lookin for official user + mobj_official = re.search(r'', webpage) + if mobj_official is None: + self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + else: + video_uploader = mobj_official.group(1) + else: + video_uploader = mobj.group(1) + + video_upload_date = u'NA' + mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -729,7 +754,7 @@ class GoogleIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -752,8 +777,6 @@ class GoogleIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -770,7 +793,7 @@ class GoogleIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'', webpage) if mobj is None: @@ -786,7 +809,6 @@ class GoogleIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -827,7 +849,7 @@ class PhotobucketIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -845,8 +867,6 @@ class PhotobucketIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) video_uploader = mobj.group(2).decode('utf-8') @@ -856,7 +876,6 @@ class PhotobucketIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -900,7 +919,7 @@ class YahooIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) @@ -924,7 +943,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract uploader and title from webpage @@ -934,7 +953,6 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -983,7 +1001,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract media URL from playlist XML @@ -992,7 +1010,7 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: Unable to extract media URL') return video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') - video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) + video_url = unescapeHTML(video_url) return [{ 'id': video_id.decode('utf-8'), @@ -1000,7 +1018,6 @@ class YahooIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, @@ -1013,7 +1030,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' + _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)' IE_NAME = u'vimeo' def __init__(self, downloader=None): @@ -1042,7 +1059,7 @@ class VimeoIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Now we begin extracting as much information as we can from what we @@ -1060,7 +1077,6 @@ class VimeoIE(InfoExtractor): # Extract title video_title = config["video"]["title"] - simple_title = simplify_title(video_title) # Extract uploader video_uploader = config["video"]["owner"]["name"] @@ -1069,18 +1085,9 @@ class VimeoIE(InfoExtractor): video_thumbnail = config["video"]["thumbnail"] # Extract video description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', webpage, re.MULTILINE) - if mobj is not None: - video_description = mobj.group(1) - else: - html_parser = lxml.etree.HTMLParser() - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() - # TODO use another parser + video_description = get_element_by_id("description", webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # Extract upload date video_upload_date = u'NA' @@ -1093,21 +1100,32 @@ class VimeoIE(InfoExtractor): timestamp = config['request']['timestamp'] # Vimeo specific: extract video codec and quality information + # First consider quality, then codecs, then take everything # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] - for codec in codecs: - if codec[0] in config["video"]["files"]: - video_codec = codec[0] - video_extension = codec[1] - if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' - else: quality = 'sd' + files = { 'hd': [], 'sd': [], 'other': []} + for codec_name, codec_extension in codecs: + if codec_name in config["video"]["files"]: + if 'hd' in config["video"]["files"][codec_name]: + files['hd'].append((codec_name, codec_extension, 'hd')) + elif 'sd' in config["video"]["files"][codec_name]: + files['sd'].append((codec_name, codec_extension, 'sd')) + else: + files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + + for quality in ('hd', 'sd', 'other'): + if len(files[quality]) > 0: + video_quality = files[quality][0][2] + video_codec = files[quality][0][0] + video_extension = files[quality][0][1] + self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) break else: self._downloader.trouble(u'ERROR: no known codec found') return video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, quality, video_codec.upper()) + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, @@ -1115,7 +1133,6 @@ class VimeoIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, @@ -1123,6 +1140,143 @@ class VimeoIE(InfoExtractor): }] +class ArteTvIE(InfoExtractor): + """arte.tv information extractor.""" + + _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' + _LIVE_URL = r'index-[0-9]+\.html$' + + IE_NAME = u'arte.tv' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) + + def fetch_webpage(self, url): + self._downloader.increment_downloads() + request = urllib2.Request(url) + try: + self.report_download_webpage(url) + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + return + except ValueError, err: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + return webpage + + def grep_webpage(self, url, regex, regexFlags, matchTuples): + page = self.fetch_webpage(url) + mobj = re.search(regex, page, regexFlags) + info = {} + + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + for (i, key, err) in matchTuples: + if mobj.group(i) is None: + self._downloader.trouble(err) + return + else: + info[key] = mobj.group(i) + + return info + + def extractLiveStream(self, url): + video_lang = url.split('/')[-4] + info = self.grep_webpage( + url, + r'src="(.*?/videothek_js.*?\.js)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + http_host = url.split('/')[2] + next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url'))) + info = self.grep_webpage( + next_url, + r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + + '(http://.*?\.swf).*?' + + '(rtmp://.*?)\'', + re.DOTALL, + [ + (1, 'path', u'ERROR: could not extract video path: %s' % url), + (2, 'player', u'ERROR: could not extract video player: %s' % url), + (3, 'url', u'ERROR: could not extract video url: %s' % url) + ] + ) + video_url = u'%s/%s' % (info.get('url'), info.get('path')) + + def extractPlus7Stream(self, url): + video_lang = url.split('/')[-3] + info = self.grep_webpage( + url, + r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + next_url = urllib.unquote(info.get('url')) + info = self.grep_webpage( + next_url, + r'