X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=9ece3030809502e5288e250987ebad6f4a32d7ca;hb=191b7cbba95679b389a509420993af56ef51545d;hp=db472aace8faabb465e9c93b7ff6013ccece4e8e;hpb=37e3cbe22e0bfa6b98a6343be88e1c8c2c7ac41f;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index db472aace..9ece30308 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,11 +1,12 @@ import base64 import hashlib import json +import netrc import os import re import socket import sys -import netrc +import time import xml.etree.ElementTree from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + int_or_none, RegexNotFoundError, sanitize_filename, unescapeHTML, @@ -68,6 +70,7 @@ class InfoExtractor(object): * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. @@ -81,6 +84,12 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * http_referer HTTP Referer header value to set. + * http_method HTTP method to use for the download. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * http_post_data Additional data to send with a POST + request. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -92,8 +101,12 @@ class InfoExtractor(object): unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height"}, + deprecated) thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. @@ -295,8 +308,12 @@ class InfoExtractor(object): def _download_json(self, url_or_request, video_id, note=u'Downloading JSON metadata', errnote=u'Unable to download JSON metadata', - transform_source=None): - json_string = self._download_webpage(url_or_request, video_id, note, errnote) + transform_source=None, + fatal=True): + json_string = self._download_webpage( + url_or_request, video_id, note, errnote, fatal=fatal) + if (not fatal) and json_string is False: + return None if transform_source: json_string = transform_source(json_string) try: @@ -363,7 +380,8 @@ class InfoExtractor(object): else: for p in pattern: mobj = re.search(p, string, flags) - if mobj: break + if mobj: + break if os.name != 'nt' and sys.stderr.isatty(): _name = u'\033[0;34m%s\033[0m' % name @@ -422,6 +440,22 @@ class InfoExtractor(object): return (username, password) + def _get_tfa_info(self): + """ + Get the two-factor authentication info + TODO - asking the user will be required for sms/phone verify + currently just uses the command line option + If there's no info available, return None + """ + if self._downloader is None: + return None + downloader_params = self._downloader.params + + if downloader_params.get('twofactor', None) is not None: + return downloader_params['twofactor'] + + return None + # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): @@ -451,18 +485,22 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = self._og_regexes('video') - if secure: regexes = self._og_regexes('video:secure_url') + regexes + regexes = self._og_regexes('video') + self._og_regexes('video:url') + if secure: + regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) - def _html_search_meta(self, name, html, display_name=None, fatal=False): + def _og_search_url(self, html, **kargs): + return self._og_search_property('url', html, **kargs) + + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): if display_name is None: display_name = name return self._html_search_regex( r'''(?ix)]+(?:itemprop|name|property)=["\']%s["\']) + (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal) + html, display_name, fatal=fatal, **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') @@ -547,6 +585,7 @@ class InfoExtractor(object): f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, + f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) @@ -568,6 +607,84 @@ class InfoExtractor(object): else: return url + def _sleep(self, timeout, video_id, msg_template=None): + if msg_template is None: + msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds' + msg = msg_template % {'video_id': video_id, 'timeout': timeout} + self.to_screen(msg) + time.sleep(timeout) + + def _extract_f4m_formats(self, manifest_url, video_id): + manifest = self._download_xml( + manifest_url, video_id, 'Downloading f4m manifest', + 'Unable to download f4m manifest') + + formats = [] + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') + for i, media_el in enumerate(media_nodes): + tbr = int_or_none(media_el.attrib.get('bitrate')) + format_id = 'f4m-%d' % (i if tbr is None else tbr) + formats.append({ + 'format_id': format_id, + 'url': manifest_url, + 'ext': 'flv', + 'tbr': tbr, + 'width': int_or_none(media_el.attrib.get('width')), + 'height': int_or_none(media_el.attrib.get('height')), + }) + self._sort_formats(formats) + + return formats + + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None): + formats = [{ + 'format_id': 'm3u8-meta', + 'url': m3u8_url, + 'ext': ext, + 'protocol': 'm3u8', + 'preference': -1, + 'resolution': 'multiple', + 'format_note': 'Quality selection URL', + }] + + m3u8_doc = self._download_webpage(m3u8_url, video_id) + last_info = None + kv_rex = re.compile( + r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-STREAM-INF:'): + last_info = {} + for m in kv_rex.finditer(line): + v = m.group('val') + if v.startswith('"'): + v = v[1:-1] + last_info[m.group('key')] = v + elif line.startswith('#') or not line.strip(): + continue + else: + tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + + f = { + 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), + 'url': line.strip(), + 'tbr': tbr, + 'ext': ext, + } + codecs = last_info.get('CODECS') + if codecs: + video, audio = codecs.split(',') + f['vcodec'] = video.partition('.')[0] + f['acodec'] = audio.partition('.')[0] + resolution = last_info.get('RESOLUTION') + if resolution: + width_str, height_str = resolution.split('x') + f['width'] = int(width_str) + f['height'] = int(height_str) + formats.append(f) + last_info = {} + self._sort_formats(formats) + return formats + class SearchInfoExtractor(InfoExtractor): """ @@ -611,4 +728,3 @@ class SearchInfoExtractor(InfoExtractor): @property def SEARCH_KEY(self): return self._SEARCH_KEY -