X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=52523d7b249210eed88d334c2fefe28cabe78706;hb=36e6f62cd0883f0f486d1666d010e5d9e6d515bd;hp=d694e818e98dc29939118a6d9fccb8b942b03128;hpb=586f1cc532d167c28e733779cbf132b94d8f76e6;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d694e818e..52523d7b2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,13 +10,11 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, compat_cookies, compat_getpass, - compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse, @@ -24,6 +22,7 @@ from ..compat import ( compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -39,6 +38,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + unified_strdate, url_basename, xpath_text, xpath_with_ns, @@ -152,6 +152,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. creator: The main artist who created the video. + release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. @@ -163,12 +164,14 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following @@ -458,7 +461,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -516,6 +519,12 @@ class InfoExtractor(object): '%s. Use --username and --password or --netrc to provide account credentials.' % msg, expected=True) + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -636,8 +645,9 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -831,7 +841,7 @@ class InfoExtractor(object): self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, compat_urllib_error.URLError): self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, item)) return False @@ -862,13 +872,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -889,7 +904,10 @@ class InfoExtractor(object): # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -925,13 +943,14 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + m3u8_doc, urlh = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) if m3u8_doc is False: return m3u8_doc + m3u8_url = urlh.geturl() last_info = None last_media = None kv_rex = re.compile( @@ -1037,6 +1056,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1046,11 +1066,22 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] return { 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, } @@ -1077,7 +1108,7 @@ class InfoExtractor(object): if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1109,8 +1140,10 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) continue if src_ext == 'f4m': @@ -1122,10 +1155,12 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url,