DateRange,
encodeFilename,
find_xpath_attr,
+ fix_xml_ampersands,
get_meta_content,
orderedSet,
+ PagedList,
parse_duration,
sanitize_filename,
shell_quote,
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None)
def test_fix_xml_ampersands(self):
    """Check that fix_xml_ampersands escapes only stray ampersands."""
    # Bare ampersands are escaped.
    self.assertEqual(
        fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
    # An existing &amp; is kept; an unknown named entity is escaped.
    self.assertEqual(
        fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
        '"&amp;x=y&amp;wrong;&amp;z=a')
    # The five predefined XML entities pass through untouched.
    self.assertEqual(
        fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
        '&amp;&apos;&gt;&lt;&quot;')
    # Decimal and hexadecimal character references pass through untouched.
    self.assertEqual(
        fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
    # '&#' without digits and a terminating ';' is not a reference.
    self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
def test_paged_list(self):
    """Slice a 5-element list served in pages of 2 in several ways."""
    def check(size, pagesize, sliceargs, expected):
        # Serve the integers 0..size-1, pagesize of them per page.
        def get_page(pagenum):
            first = pagenum * pagesize
            last = min(size, first + pagesize)
            return (i for i in range(first, last))

        paged = PagedList(get_page, pagesize)
        self.assertEqual(paged.getslice(*sliceargs), expected)

    cases = [
        ((), [0, 1, 2, 3, 4]),
        ((1,), [1, 2, 3, 4]),
        ((2,), [2, 3, 4]),
        ((4,), [4]),
        ((0, 3), [0, 1, 2]),
        ((1, 4), [1, 2, 3]),
        ((2, 99), [2, 3, 4]),
        ((20, 99), []),
    ]
    for sliceargs, expected in cases:
        check(5, 2, sliceargs, expected)
+
if __name__ == '__main__':
unittest.main()
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
+ PagedList,
PostProcessingError,
platform_name,
preferredencoding,
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fribidi
debug_printtraffic: Print out sent and received HTTP traffic
+ include_ads: Download ads as well
+ default_search: Prepend this string if an input url is not valid.
+ 'auto' for elaborate guessing
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
playlist_results = []
- n_all_entries = len(ie_result['entries'])
playliststart = self.params.get('playliststart', 1) - 1
playlistend = self.params.get('playlistend', None)
# For backwards compatibility, interpret -1 as whole list
if playlistend == -1:
playlistend = None
- entries = ie_result['entries'][playliststart:playlistend]
- n_entries = len(entries)
-
- self.to_screen(
- "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
- (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ if isinstance(ie_result['entries'], list):
+ n_all_entries = len(ie_result['entries'])
+ entries = ie_result['entries'][playliststart:playlistend]
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+ (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ else:
+ assert isinstance(ie_result['entries'], PagedList)
+ entries = ie_result['entries'].getslice(
+ playliststart, playlistend)
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Downloading %d videos" %
+ (ie_result['extractor'], playlist, n_entries))
for i, entry in enumerate(entries, 1):
self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
return available_formats[-1]
elif format_spec == 'worst':
return available_formats[0]
+ elif format_spec == 'bestaudio':
+ audio_formats = [
+ f for f in available_formats
+ if f.get('vcodec') == 'none']
+ if audio_formats:
+ return audio_formats[-1]
+ elif format_spec == 'worstaudio':
+ audio_formats = [
+ f for f in available_formats
+ if f.get('vcodec') == 'none']
+ if audio_formats:
+ return audio_formats[0]
else:
extensions = ['mp4', 'flv', 'webm', '3gp']
if format_spec in extensions:
self.list_formats(info_dict)
return
- req_format = self.params.get('format', 'best')
+ req_format = self.params.get('format')
if req_format is None:
req_format = 'best'
formats_to_download = []
res += 'audio'
if fdict.get('abr') is not None:
res += '@%3dk' % fdict['abr']
+ if fdict.get('asr') is not None:
+ res += ' (%5dHz)' % fdict['asr']
if fdict.get('filesize') is not None:
if res:
res += ', '
get_element_by_id,
get_element_by_attribute,
ExtractorError,
+ int_or_none,
+ PagedList,
RegexNotFoundError,
unescapeHTML,
unified_strdate,
u"uploader_id": u"setindia"
}
},
+ {
+ u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
+ u"file": u"a9LDPn-MO4I.m4a",
+ u"note": u"256k DASH audio (format 141) via DASH manifest",
+ u"params": {
+ u"format": "141"
+ },
+ u"info_dict": {
+ u"upload_date": "20121002",
+ u"uploader_id": "8KVIDEO",
+ u"description": "No description available.",
+ u"uploader": "8KVIDEO",
+ u"title": "UHDTV TEST 8K VIDEO.mp4"
+ }
+ },
]
video_id = mobj.group(2)
return video_id
- def _get_video_url_list(self, url_map):
- """
- Transform a dictionary in the format {itag:url} to a list of (itag, url)
- with the requested formats.
- """
- existing_formats = [x for x in self._formats if x in url_map]
- if len(existing_formats) == 0:
- raise ExtractorError(u'no known formats available for video')
- video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
- video_url_list.reverse() # order worst to best
- return video_url_list
-
def _extract_from_m3u8(self, manifest_url, video_id):
url_map = {}
def _get_urls(_manifest):
video_annotations = self._extract_annotations(video_id)
# Decide which formats to download
-
try:
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
if not mobj:
except ValueError:
pass
def _map_to_format_list(urlmap):
    """Turn an {itag: url} mapping into a list of format dicts.

    Each dict carries the itag as 'format_id', the URL and the enclosing
    scope's player_url, merged with the static metadata that
    self._formats holds for that itag.
    """
    formats = []
    for itag, video_real_url in urlmap.items():
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # An itag missing from the static table must not abort the whole
        # extraction with a KeyError; keep the bare entry instead (the
        # DASH manifest handling looks formats up the same way).
        dct.update(self._formats.get(itag, {}))
        formats.append(dct)
    return formats
+
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
- video_url_list = [('_rtmp', video_info['conn'][0])]
+ formats = [{
+ 'format_id': '_rtmp',
+ 'protocol': 'rtmp',
+ 'url': video_info['conn'][0],
+ 'player_url': player_url,
+ }]
elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
if 'ratebypass' not in url:
url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
- video_url_list = self._get_video_url_list(url_map)
+ formats = _map_to_format_list(url_map)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
- video_url_list = self._get_video_url_list(url_map)
+ formats = _map_to_format_list(url_map)
else:
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
- formats = []
- for itag, video_real_url in video_url_list:
- dct = {
- 'format_id': itag,
- 'url': video_real_url,
- 'player_url': player_url,
- }
- dct.update(self._formats[itag])
- formats.append(dct)
+ # Look for the DASH manifest
+ dash_manifest_url_lst = video_info.get('dashmpd')
+ if dash_manifest_url_lst and dash_manifest_url_lst[0]:
+ try:
+ dash_doc = self._download_xml(
+ dash_manifest_url_lst[0], video_id,
+ note=u'Downloading DASH manifest',
+ errnote=u'Could not download DASH manifest')
+ for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ }
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ f.update(self._formats.get(format_id, {}))
+ formats.append(f)
+ else:
+ existing_format.update(f)
+
+ except (ExtractorError, KeyError) as e:
+ self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
self._sort_formats(formats)
# page by page until there are no video ids - it means we got
# all of them.
- url_results = []
-
- for pagenum in itertools.count(0):
def download_page(pagenum):
    """Yield one url-result dict per video listed on page pagenum.

    pagenum is zero-based; the API's start index is one-based, hence the
    "+ 1" below. Nothing is yielded when the response feed has no 'entry'
    key. Raises ExtractorError if the response is not valid JSON.
    """
    start_index = pagenum * self._GDATA_PAGE_SIZE + 1
    gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
    page = self._download_webpage(
        gdata_url, username,
        u'Downloading video ids from %d to %d' % (
            start_index, start_index + self._GDATA_PAGE_SIZE))

    try:
        response = json.loads(page)
    except ValueError as err:
        raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
    if 'entry' not in response['feed']:
        return

    # Extract video identifiers
    entries = response['feed']['entry']
    for entry in entries:
        title = entry['title']['$t']
        video_id = entry['id']['$t'].split('/')[-1]
        yield {
            '_type': 'url',
            'url': video_id,
            'ie_key': 'Youtube',
            # The extracted id itself, not the literal string 'video_id'.
            'id': video_id,
            'title': title,
        }
+ url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)
import email.utils
import errno
import gzip
+ import itertools
import io
import json
import locale
def find_xpath_attr(node, xpath, key, val):
    """ Look up xpath[@key='val'] below node and return node.find()'s result """
    # key and val are interpolated into the expression unescaped, so only
    # a conservative set of characters is let through.
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    attr_predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + attr_predicate)
else:
return None
def fix_xml_ampersands(xml_str):
    """Replace every stray '&' in xml_str by '&amp;'.

    An ampersand is left alone when it already starts one of the five
    predefined XML entities (amp, lt, gt, apos, quot) or a numeric
    character reference with at least one digit (&#NNN; or &#xHHH;).
    Any other ampersand is escaped.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
def setproctitle(title):
return "HEAD"
def int_or_none(v, scale=1):
    """Pass None through; otherwise return int(v) floor-divided by scale."""
    if v is None:
        return None
    return int(v) // scale
def parse_duration(s):
except OSError:
return False
return exe
+
+
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with that page's entries. A page holding fewer than
    pagesize entries is treated as the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list.

        Only the pages overlapping the requested range are fetched. end=None
        means "up to the last entry". Negative indices are not supported.
        """
        res = []
        # An empty or inverted range selects nothing; answering it here
        # avoids fetching pages and avoids treating "end" as a page boundary
        # of a page that lies entirely behind it.
        if end is not None and end <= start:
            return res

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # A little optimization - if the current page is not "full",
            # i.e. does not contain pagesize entries, there is nothing on
            # further pages and no need to query again.
            is_last_page = len(page_results) < self._pagesize

            # Offsets relative to this page. Only the first fetched page can
            # begin in the middle; only the page containing end is cut short.
            startv = start - firstid if start > firstid else 0
            endv = (
                end - firstid
                if end is not None and end <= nextfirstid
                else None)
            res.extend(page_results[startv:endv])

            # Stop after the last page, or once the requested end has been
            # reached (the next page is not interesting).
            if is_last_page or endv is not None:
                break
        return res