X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=inline;f=youtube_dl%2Fextractor%2Fcommon.py;h=9ece3030809502e5288e250987ebad6f4a32d7ca;hb=191b7cbba95679b389a509420993af56ef51545d;hp=da4193734971122c2ef72f3dca4acecd93e5f784;hpb=b6cfde99b7e4ddd16123a1fbc6173c05d691415c;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index da4193734..9ece30308 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,11 +1,12 @@
import base64
import hashlib
import json
+import netrc
import os
import re
import socket
import sys
-import netrc
+import time
import xml.etree.ElementTree
from ..utils import (
@@ -17,6 +18,7 @@ from ..utils import (
clean_html,
compiled_regex_type,
ExtractorError,
+ int_or_none,
RegexNotFoundError,
sanitize_filename,
unescapeHTML,
@@ -68,6 +70,7 @@ class InfoExtractor(object):
* vcodec Name of the video codec in use
* container Name of the container format
* filesize The number of bytes, if known in advance
+ * filesize_approx An estimate for the number of bytes
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
download, lower-case.
@@ -81,6 +84,12 @@ class InfoExtractor(object):
format, irrespective of the file format.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ * http_referer HTTP Referer header value to set.
+ * http_method HTTP method to use for the download.
+ * http_headers A dictionary of additional HTTP headers
+ to add to the request.
+ * http_post_data Additional data to send with a POST
+ request.
url: Final video URL.
ext: Video filename extension.
format: The video format, defaults to ext (used for --get-format)
@@ -92,8 +101,12 @@ class InfoExtractor(object):
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats"
- thumbnails: A list of dictionaries (with the entries "resolution" and
- "url") for the varying thumbnails
+ thumbnails: A list of dictionaries, with the following entries:
+ * "url"
+ * "width" (optional, int)
+ * "height" (optional, int)
+ * "resolution" (optional, string "{width}x{height}",
+ deprecated)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
uploader: Full name of the video uploader.
@@ -113,6 +126,8 @@ class InfoExtractor(object):
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
+ categories: A list of categories that the video falls in, for example
+ ["Sports", "Berlin"]
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -242,16 +257,20 @@ class InfoExtractor(object):
url = url_or_request.get_full_url()
except AttributeError:
url = url_or_request
- if len(url) > 200:
- h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
- url = url[:200 - len(h)] + h
- raw_filename = ('%s_%s.dump' % (video_id, url))
+ basen = '%s_%s' % (video_id, url)
+ if len(basen) > 240:
+ h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+ basen = basen[:240 - len(h)] + h
+ raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
self.to_screen(u'Saving request to ' + filename)
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
- content = webpage_bytes.decode(encoding, 'replace')
+ try:
+ content = webpage_bytes.decode(encoding, 'replace')
+ except LookupError:
+ content = webpage_bytes.decode('utf-8', 'replace')
if (u'<title>Access to this site is blocked</title>' in content and
u'Websense' in content[:512]):
@@ -276,9 +295,12 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note=u'Downloading XML', errnote=u'Unable to download XML',
- transform_source=None):
+ transform_source=None, fatal=True):
"""Return the xml as an xml.etree.ElementTree.Element"""
- xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ xml_string = self._download_webpage(
+ url_or_request, video_id, note, errnote, fatal=fatal)
+ if xml_string is False:
+ return xml_string
if transform_source:
xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
@@ -286,8 +308,12 @@ class InfoExtractor(object):
def _download_json(self, url_or_request, video_id,
note=u'Downloading JSON metadata',
errnote=u'Unable to download JSON metadata',
- transform_source=None):
- json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ transform_source=None,
+ fatal=True):
+ json_string = self._download_webpage(
+ url_or_request, video_id, note, errnote, fatal=fatal)
+ if (not fatal) and json_string is False:
+ return None
if transform_source:
json_string = transform_source(json_string)
try:
@@ -354,7 +380,8 @@ class InfoExtractor(object):
else:
for p in pattern:
mobj = re.search(p, string, flags)
- if mobj: break
+ if mobj:
+ break
if os.name != 'nt' and sys.stderr.isatty():
_name = u'\033[0;34m%s\033[0m' % name
@@ -413,6 +440,22 @@ class InfoExtractor(object):
return (username, password)
+ def _get_tfa_info(self):
+ """
+ Get the two-factor authentication info
+ TODO - asking the user will be required for sms/phone verify
+ currently just uses the command line option
+ If there's no info available, return None
+ """
+ if self._downloader is None:
+ return None
+ downloader_params = self._downloader.params
+
+ if downloader_params.get('twofactor', None) is not None:
+ return downloader_params['twofactor']
+
+ return None
+
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
@@ -442,18 +485,22 @@ class InfoExtractor(object):
return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
- regexes = self._og_regexes('video')
- if secure: regexes = self._og_regexes('video:secure_url') + regexes
+ regexes = self._og_regexes('video') + self._og_regexes('video:url')
+ if secure:
+ regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
- def _html_search_meta(self, name, html, display_name=None, fatal=False):
+ def _og_search_url(self, html, **kargs):
+ return self._og_search_property('url', html, **kargs)
+
+ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
- (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+ (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal)
+ html, display_name, fatal=fatal, **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
@@ -538,10 +585,106 @@ class InfoExtractor(object):
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
f.get('filesize') if f.get('filesize') is not None else -1,
+ f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('format_id'),
)
formats.sort(key=_formats_key)
+ def http_scheme(self):
+ """ Either "http:" or "https:", depending on the user's preferences """
+ return (
+ 'http:'
+ if self._downloader.params.get('prefer_insecure', False)
+ else 'https:')
+
+ def _proto_relative_url(self, url, scheme=None):
+ if url is None:
+ return url
+ if url.startswith('//'):
+ if scheme is None:
+ scheme = self.http_scheme()
+ return scheme + url
+ else:
+ return url
+
+ def _sleep(self, timeout, video_id, msg_template=None):
+ if msg_template is None:
+ msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
+ msg = msg_template % {'video_id': video_id, 'timeout': timeout}
+ self.to_screen(msg)
+ time.sleep(timeout)
+
+ def _extract_f4m_formats(self, manifest_url, video_id):
+ manifest = self._download_xml(
+ manifest_url, video_id, 'Downloading f4m manifest',
+ 'Unable to download f4m manifest')
+
+ formats = []
+ media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+ for i, media_el in enumerate(media_nodes):
+ tbr = int_or_none(media_el.attrib.get('bitrate'))
+ format_id = 'f4m-%d' % (i if tbr is None else tbr)
+ formats.append({
+ 'format_id': format_id,
+ 'url': manifest_url,
+ 'ext': 'flv',
+ 'tbr': tbr,
+ 'width': int_or_none(media_el.attrib.get('width')),
+ 'height': int_or_none(media_el.attrib.get('height')),
+ })
+ self._sort_formats(formats)
+
+ return formats
+
+ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
+ formats = [{
+ 'format_id': 'm3u8-meta',
+ 'url': m3u8_url,
+ 'ext': ext,
+ 'protocol': 'm3u8',
+ 'preference': -1,
+ 'resolution': 'multiple',
+ 'format_note': 'Quality selection URL',
+ }]
+
+ m3u8_doc = self._download_webpage(m3u8_url, video_id)
+ last_info = None
+ kv_rex = re.compile(
+ r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
+ for line in m3u8_doc.splitlines():
+ if line.startswith('#EXT-X-STREAM-INF:'):
+ last_info = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_info[m.group('key')] = v
+ elif line.startswith('#') or not line.strip():
+ continue
+ else:
+ tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+
+ f = {
+ 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
+ 'url': line.strip(),
+ 'tbr': tbr,
+ 'ext': ext,
+ }
+ codecs = last_info.get('CODECS')
+ if codecs:
+ video, audio = codecs.split(',')
+ f['vcodec'] = video.partition('.')[0]
+ f['acodec'] = audio.partition('.')[0]
+ resolution = last_info.get('RESOLUTION')
+ if resolution:
+ width_str, height_str = resolution.split('x')
+ f['width'] = int(width_str)
+ f['height'] = int(height_str)
+ formats.append(f)
+ last_info = {}
+ self._sort_formats(formats)
+ return formats
+
class SearchInfoExtractor(InfoExtractor):
"""