From: Philipp Hagemeister
Date: Wed, 22 Jan 2014 19:00:16 +0000 (+0100)
Subject: Merge branch 'paged-lists'
X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=65697b3bf3bf6eaeb91a34e5308a6d2239118071;hp=-c;p=youtube-dl

Merge branch 'paged-lists'

Conflicts:
test/test_utils.py
youtube_dl/extractor/youtube.py
---

65697b3bf3bf6eaeb91a34e5308a6d2239118071
diff --combined test/test_utils.py
index a17483ada,349c1107f..c68e0e968
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@@ -16,9 -16,9 +16,10 @@@ from youtube_dl.utils import
DateRange,
encodeFilename,
find_xpath_attr,
+ fix_xml_ampersands,
get_meta_content,
orderedSet,
+ PagedList,
parse_duration,
sanitize_filename,
shell_quote,
@@@ -201,18 -201,26 +202,39 @@@ class TestUtil(unittest.TestCase)
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None)

+ def test_fix_xml_ampersands(self):
+ self.assertEqual(
+ fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
+ self.assertEqual(
+ fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
+ '"&amp;x=y&amp;wrong;&amp;z=a')
+ self.assertEqual(
+ fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
+ '&amp;&apos;&gt;&lt;&quot;')
+ self.assertEqual(
+ fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
+ self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
+ def test_paged_list(self):
+ def testPL(size, pagesize, sliceargs, expected):
+ def get_page(pagenum):
+ firstid = pagenum * pagesize
+ upto = min(size, pagenum * pagesize + pagesize)
+ for i in range(firstid, upto):
+ yield i
+
+ pl = PagedList(get_page, pagesize)
+ got = pl.getslice(*sliceargs)
+ self.assertEqual(got, expected)
+
+ testPL(5, 2, (), [0, 1, 2, 3, 4])
+ testPL(5, 2, (1,), [1, 2, 3, 4])
+ testPL(5, 2, (2,), [2, 3, 4])
+ testPL(5, 2, (4,), [4])
+ testPL(5, 2, (0, 3), [0, 1, 2])
+ testPL(5, 2, (1, 4), [1, 2, 3])
+ testPL(5, 2, (2, 99), [2, 3, 4])
+ testPL(5, 2, (20, 99), [])
+
if __name__ == '__main__':
unittest.main()
diff --combined youtube_dl/YoutubeDL.py
index 87e7d21d6,2ad6f1028..a48e8ba23
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@@ -39,6 -39,7 +39,7 @@@ from .utils import
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
+ PagedList,
PostProcessingError,
platform_name,
preferredencoding,
@@@ -151,9 -152,6 +152,9 @@@ class YoutubeDL(object)
bidi_workaround: Work around buggy terminals without bidirectional text support, using fribidi
debug_printtraffic: Print out sent and received HTTP traffic
+ include_ads: Download ads as well
+ default_search: Prepend this string if an input url is not valid.
+ 'auto' for elaborate guessing The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@@ -578,19 -576,27 +579,27 @@@ playlist_results = [] - n_all_entries = len(ie_result['entries']) playliststart = self.params.get('playliststart', 1) - 1 playlistend = self.params.get('playlistend', None) # For backwards compatibility, interpret -1 as whole list if playlistend == -1: playlistend = None - entries = ie_result['entries'][playliststart:playlistend] - n_entries = len(entries) - - self.to_screen( - "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) + if isinstance(ie_result['entries'], list): + n_all_entries = len(ie_result['entries']) + entries = ie_result['entries'][playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Collected %d video ids (downloading %d of them)" % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + else: + assert isinstance(ie_result['entries'], PagedList) + entries = ie_result['entries'].getslice( + playliststart, playlistend) + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Downloading %d videos" % + (ie_result['extractor'], playlist, n_entries)) for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) @@@ -637,18 -643,6 +646,18 @@@ return available_formats[-1] elif format_spec == 'worst': return available_formats[0] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in available_formats + if f.get('vcodec') == 'none'] + if audio_formats: + return audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in available_formats + if f.get('vcodec') == 'none'] + if audio_formats: + return audio_formats[0] else: extensions = ['mp4', 'flv', 'webm', '3gp'] if format_spec in extensions: @@@ -713,7 -707,7 +722,7 @@@ self.list_formats(info_dict) return - req_format = self.params.get('format', 'best') + req_format = self.params.get('format') if req_format is None: req_format = 'best' formats_to_download = [] @@@ -1106,8 -1100,6 +1115,8 @@@ res += 'audio' if fdict.get('abr') is not None: res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] if fdict.get('filesize') is not None: if res: res += ', ' diff --combined youtube_dl/extractor/youtube.py index 870b7c4ca,dd1a58f3f..57b8fdff7 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@@ -27,7 -27,7 +27,8 @@@ from ..utils import get_element_by_id, get_element_by_attribute, ExtractorError, + int_or_none, + PagedList, RegexNotFoundError, unescapeHTML, unified_strdate, @@@ -271,21 -271,6 +272,21 @@@ class YoutubeIE(YoutubeBaseInfoExtracto u"uploader_id": u"setindia" } }, + { + u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", + u"file": u"a9LDPn-MO4I.m4a", + u"note": u"256k DASH audio (format 141) via DASH manifest", + u"params": { + u"format": "141" + }, + u"info_dict": { + u"upload_date": "20121002", + u"uploader_id": "8KVIDEO", + u"description": "No description available.", + u"uploader": "8KVIDEO", + u"title": "UHDTV TEST 8K VIDEO.mp4" + } + }, ] @@@ -1083,6 -1068,18 +1084,6 @@@ video_id = mobj.group(2) return video_id - def _get_video_url_list(self, url_map): - """ - Transform a dictionary in the format {itag:url} to a list of (itag, url) - with the requested formats. 
- """ - existing_formats = [x for x in self._formats if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - video_url_list.reverse() # order worst to best - return video_url_list - def _extract_from_m3u8(self, manifest_url, video_id): url_map = {} def _get_urls(_manifest): @@@ -1256,6 -1253,7 +1257,6 @@@ video_annotations = self._extract_annotations(video_id) # Decide which formats to download - try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) if not mobj: @@@ -1280,26 -1278,9 +1281,26 @@@ except ValueError: pass + def _map_to_format_list(urlmap): + formats = [] + for itag, video_real_url in urlmap.items(): + dct = { + 'format_id': itag, + 'url': video_real_url, + 'player_url': player_url, + } + dct.update(self._formats[itag]) + formats.append(dct) + return formats + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() - video_url_list = [('_rtmp', video_info['conn'][0])] + formats = [{ + 'format_id': '_rtmp', + 'protocol': 'rtmp', + 'url': video_info['conn'][0], + 'player_url': player_url, + }] elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] if 'rtmpe%3Dyes' in encoded_url_map: @@@ -1344,49 -1325,23 +1345,49 @@@ if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - formats = [] - for itag, video_real_url in video_url_list: - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - dct.update(self._formats[itag]) - formats.append(dct) + # Look for the DASH manifest + dash_manifest_url_lst = video_info.get('dashmpd') + if dash_manifest_url_lst and dash_manifest_url_lst[0]: + try: + dash_doc = self._download_xml( + dash_manifest_url_lst[0], video_id, + note=u'Downloading DASH manifest', + errnote=u'Could not download DASH manifest') + for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + f.update(self._formats.get(format_id, {})) + formats.append(f) + else: + existing_format.update(f) + + except (ExtractorError, KeyError) as e: + self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) @@@ -1626,44 -1581,35 +1627,35 @@@ class YoutubeUserIE(InfoExtractor) # page by page until there 
are no video ids - it means we got
# all of them.
- url_results = []
-
- for pagenum in itertools.count(0):
+ def download_page(pagenum):
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
- page = self._download_webpage(gdata_url, username,
- u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+ page = self._download_webpage(
+ gdata_url, username,
+ u'Downloading video ids from %d to %d' % (
+ start_index, start_index + self._GDATA_PAGE_SIZE))
try:
response = json.loads(page)
except ValueError as err:
raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
- break
+ return
# Extract video identifiers
entries = response['feed']['entry']
for entry in entries:
title = entry['title']['$t']
video_id = entry['id']['$t'].split('/')[-1]
- url_results.append({
+ yield {
'_type': 'url',
'url': video_id,
'ie_key': 'Youtube',
'id': video_id,
'title': title,
- })
-
- # A little optimization - if current page is not
- # "full", ie. does not contain PAGE_SIZE video ids then
- # we can assume that this page is the last one - there
- # are no more ids on further pages - no need to query
- # again.
-
- if len(entries) < self._GDATA_PAGE_SIZE:
- break
+ }
+ url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)
diff --combined youtube_dl/utils.py
index 3d2903986,ff124d9e8..ed5ee222f
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@@ -6,6 -6,7 +6,7 @@@ import datetim
import email.utils
import errno
import gzip
+ import itertools
import io
import json
import locale
@@@ -224,7 -225,7 +225,7 @@@ if sys.version_info >= (2,7)
def find_xpath_attr(node, xpath, key, val):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z]+$', key)
- assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
+ assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
expr = xpath + u"[@%s='%s']" % (key, val)
return node.find(expr)
else:
@@@ -1092,12 -1093,9 +1093,12 @@@ def month_by_name(name)
return None

-def fix_xml_all_ampersand(xml_str):
+def fix_xml_ampersands(xml_str):
"""Replace all the '&' by '&amp;' in XML"""
- return xml_str.replace(u'&', u'&amp;')
+ return re.sub(
+ r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+ u'&amp;',
+ xml_str)

def setproctitle(title):
@@@ -1131,8 -1129,8 +1132,8 @@@ class HEADRequest(compat_urllib_request
return "HEAD"

-def int_or_none(v):
- return v if v is None else int(v)
+def int_or_none(v, scale=1):
+ return v if v is None else (int(v) // scale)

def parse_duration(s):
@@@ -1164,3 -1162,46 +1165,46 @@@ def check_executable(exe, args=[])
except OSError:
return False
return exe
+
+
+ class PagedList(object):
+ def __init__(self, pagefunc, pagesize):
+ self._pagefunc = pagefunc
+ self._pagesize = pagesize
+
+ def getslice(self, start=0, end=None):
+ res = []
+ for pagenum in itertools.count(start // self._pagesize):
+ firstid = pagenum * self._pagesize
+ nextfirstid = pagenum * self._pagesize + self._pagesize
+ if start >= nextfirstid:
+ continue
+
+ page_results = list(self._pagefunc(pagenum))
+
+ startv = (
+ start % self._pagesize
+ if firstid <= start < nextfirstid
+ else 0)
+
+ endv = (
+ ((end - 1) % self._pagesize) + 1
+ if (end is not None and firstid <= end <= nextfirstid)
+ else None)
+
+ if startv != 0 or endv is not None:
+ page_results = page_results[startv:endv]
+ res.extend(page_results)
+
+ # A little optimization - if the current page is not "full",
+ # i.e. it does not contain page_size videos, we can assume that
+ # this page is the last one - there are no more ids on further
+ # pages, so there is no need to query again.
+ if len(page_results) + startv < self._pagesize:
+ break
+
+ # If we got the whole page, but the next page is not
+ # interesting, break out early as well.
+ if end == nextfirstid:
+ break
+ return res
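
A minimal usage sketch of the new PagedList, in the spirit of the YoutubeUserIE change above. PagedList and getslice() are taken from this commit; the page size, the 123-entry total and the fetch_page() helper are invented for illustration, and the import assumes this youtube-dl checkout is on the Python path.

from youtube_dl.utils import PagedList

PAGE_SIZE = 50   # hypothetical page size (YoutubeUserIE uses self._GDATA_PAGE_SIZE)
TOTAL = 123      # pretend the backend knows about 123 videos

def fetch_page(pagenum):
    # In a real extractor this would be one API request per page;
    # here we just synthesize one page worth of result dicts.
    first = pagenum * PAGE_SIZE
    for i in range(first, min(first + PAGE_SIZE, TOTAL)):
        yield {'_type': 'url', 'url': 'video%d' % i, 'ie_key': 'Youtube'}

entries = PagedList(fetch_page, PAGE_SIZE)

# Only the page covering indices 100-109 (page 2) is fetched for this
# slice; getslice() starts its page loop at start // pagesize.
print(len(entries.getslice(100, 110)))  # -> 10

# With no arguments, getslice() walks pages until a short ("not full")
# page marks the end of the list, like the old itertools.count() loop
# in YoutubeUserIE, so no page is requested twice and none past the end.
print(len(entries.getslice()))          # -> 123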