Merge branch 'paged-lists'
author    Philipp Hagemeister <phihag@phihag.de>
          Wed, 22 Jan 2014 19:00:16 +0000 (20:00 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
          Wed, 22 Jan 2014 19:00:16 +0000 (20:00 +0100)
Conflicts:
test/test_utils.py
youtube_dl/extractor/youtube.py

test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py

diff --combined test/test_utils.py
index a17483ada829345e8e96f23dbdeaeea7a5451294,349c1107f4c123fd043682428b36def8690708cb..c68e0e96844213f6626c950bd89487b74223593e
@@@ -16,9 -16,9 +16,10 @@@ from youtube_dl.utils import 
      DateRange,
      encodeFilename,
      find_xpath_attr,
 +    fix_xml_ampersands,
      get_meta_content,
      orderedSet,
+     PagedList,
      parse_duration,
      sanitize_filename,
      shell_quote,
@@@ -201,18 -201,26 +202,39 @@@ class TestUtil(unittest.TestCase)
          self.assertEqual(parse_duration('9:12:43'), 33163)
          self.assertEqual(parse_duration('x:y'), None)
  
 +    def test_fix_xml_ampersands(self):
 +        self.assertEqual(
 +            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
 +        self.assertEqual(
 +            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
 +            '"&amp;x=y&amp;wrong;&amp;z=a')
 +        self.assertEqual(
 +            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
 +            '&amp;&apos;&gt;&lt;&quot;')
 +        self.assertEqual(
 +            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
 +        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
 +
+     def test_paged_list(self):
+         def testPL(size, pagesize, sliceargs, expected):
+             def get_page(pagenum):
+                 firstid = pagenum * pagesize
+                 upto = min(size, pagenum * pagesize + pagesize)
+                 for i in range(firstid, upto):
+                     yield i
+             pl = PagedList(get_page, pagesize)
+             got = pl.getslice(*sliceargs)
+             self.assertEqual(got, expected)
+         testPL(5, 2, (), [0, 1, 2, 3, 4])
+         testPL(5, 2, (1,), [1, 2, 3, 4])
+         testPL(5, 2, (2,), [2, 3, 4])
+         testPL(5, 2, (4,), [4])
+         testPL(5, 2, (0, 3), [0, 1, 2])
+         testPL(5, 2, (1, 4), [1, 2, 3])
+         testPL(5, 2, (2, 99), [2, 3, 4])
+         testPL(5, 2, (20, 99), [])


  if __name__ == '__main__':
      unittest.main()
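
A minimal sketch (not part of the diff) of the escaping rule the new tests pin down: fix_xml_ampersands is expected to touch only ampersands that do not already start an entity or a numeric character reference.

    from youtube_dl.utils import fix_xml_ampersands

    # Bare '&' is escaped; '&amp;' and numeric references such as
    # '&#65;' or '&#x41;' are already valid and stay untouched.
    assert fix_xml_ampersands('a & b &amp; c &#65;') == 'a &amp; b &amp; c &#65;'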
diff --combined youtube_dl/YoutubeDL.py
index 87e7d21d69ef93f2183cce8119f47f5069ba5ea7,2ad6f10286784585865e1445d223f39a2e5d31d7..a48e8ba23a34608072852dcb965640c55e5c37fc
@@@ -39,6 -39,7 +39,7 @@@ from .utils import 
      locked_file,
      make_HTTPS_handler,
      MaxDownloadsReached,
+     PagedList,
      PostProcessingError,
      platform_name,
      preferredencoding,
@@@ -151,9 -152,6 +152,9 @@@ class YoutubeDL(object)
      bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fribidi
      debug_printtraffic:Print out sent and received HTTP traffic
 +    include_ads:       Download ads as well
 +    default_search:    Prepend this string if an input URL is not valid.
 +                       'auto' for elaborate guessing
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
  
              playlist_results = []
  
-             n_all_entries = len(ie_result['entries'])
              playliststart = self.params.get('playliststart', 1) - 1
              playlistend = self.params.get('playlistend', None)
              # For backwards compatibility, interpret -1 as whole list
              if playlistend == -1:
                  playlistend = None
  
-             entries = ie_result['entries'][playliststart:playlistend]
-             n_entries = len(entries)
-             self.to_screen(
-                 "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
+             if isinstance(ie_result['entries'], list):
+                 n_all_entries = len(ie_result['entries'])
+                 entries = ie_result['entries'][playliststart:playlistend]
+                 n_entries = len(entries)
+                 self.to_screen(
+                     "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
+             else:
+                 assert isinstance(ie_result['entries'], PagedList)
+                 entries = ie_result['entries'].getslice(
+                     playliststart, playlistend)
+                 n_entries = len(entries)
+                 self.to_screen(
+                     "[%s] playlist %s: Downloading %d videos" %
+                     (ie_result['extractor'], playlist, n_entries))
  
              for i, entry in enumerate(entries, 1):
                  self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
              return available_formats[-1]
          elif format_spec == 'worst':
              return available_formats[0]
 +        elif format_spec == 'bestaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[-1]
 +        elif format_spec == 'worstaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[0]
          else:
              extensions = ['mp4', 'flv', 'webm', '3gp']
              if format_spec in extensions:
              self.list_formats(info_dict)
              return
  
 -        req_format = self.params.get('format', 'best')
 +        req_format = self.params.get('format')
          if req_format is None:
              req_format = 'best'
          formats_to_download = []
                  res += 'audio'
              if fdict.get('abr') is not None:
                  res += '@%3dk' % fdict['abr']
 +            if fdict.get('asr') is not None:
 +                res += ' (%5dHz)' % fdict['asr']
              if fdict.get('filesize') is not None:
                  if res:
                      res += ', '
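
The new 'bestaudio' and 'worstaudio' branches above rely on available_formats being sorted worst-to-best, so [-1] and [0] pick the best and worst audio-only entry. A minimal sketch with made-up format dicts (the entries are hypothetical, not real extractor output):

    available_formats = [  # assumed sorted worst-to-best
        {'format_id': '140', 'vcodec': 'none', 'abr': 128},
        {'format_id': '18', 'vcodec': 'h264', 'abr': 96},
        {'format_id': '141', 'vcodec': 'none', 'abr': 256},
    ]
    audio_formats = [f for f in available_formats if f.get('vcodec') == 'none']
    best_audio = audio_formats[-1] if audio_formats else None   # format 141
    worst_audio = audio_formats[0] if audio_formats else None   # format 140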
diff --combined youtube_dl/extractor/youtube.py
index 870b7c4cabb502f870b7f1aeee9b5295156b3e29,dd1a58f3fc0f5448fdf1266ef0b51840a6307d8e..57b8fdff7204b1fbb5dc6d88742704638995b0eb
@@@ -27,7 -27,7 +27,8 @@@ from ..utils import 
      get_element_by_id,
      get_element_by_attribute,
      ExtractorError,
 +    int_or_none,
+     PagedList,
      RegexNotFoundError,
      unescapeHTML,
      unified_strdate,
@@@ -271,21 -271,6 +272,21 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                  u"uploader_id": u"setindia"
              }
          },
 +        {
 +            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 +            u"file": u"a9LDPn-MO4I.m4a",
 +            u"note": u"256k DASH audio (format 141) via DASH manifest",
 +            u"params": {
 +                u"format": "141"
 +            },
 +            u"info_dict": {
 +                u"upload_date": "20121002",
 +                u"uploader_id": "8KVIDEO",
 +                u"description": "No description available.",
 +                u"uploader": "8KVIDEO",
 +                u"title": "UHDTV TEST 8K VIDEO.mp4"
 +            }
 +        },
      ]
  
  
          video_id = mobj.group(2)
          return video_id
  
 -    def _get_video_url_list(self, url_map):
 -        """
 -        Transform a dictionary in the format {itag:url} to a list of (itag, url)
 -        with the requested formats.
 -        """
 -        existing_formats = [x for x in self._formats if x in url_map]
 -        if len(existing_formats) == 0:
 -            raise ExtractorError(u'no known formats available for video')
 -        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 -        video_url_list.reverse() # order worst to best
 -        return video_url_list
 -
      def _extract_from_m3u8(self, manifest_url, video_id):
          url_map = {}
          def _get_urls(_manifest):
                  video_annotations = self._extract_annotations(video_id)
  
          # Decide which formats to download
 -
          try:
              mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
              if not mobj:
          except ValueError:
              pass
  
 +        def _map_to_format_list(urlmap):
 +            formats = []
 +            for itag, video_real_url in urlmap.items():
 +                dct = {
 +                    'format_id': itag,
 +                    'url': video_real_url,
 +                    'player_url': player_url,
 +                }
 +                dct.update(self._formats[itag])
 +                formats.append(dct)
 +            return formats
 +
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
 -            video_url_list = [('_rtmp', video_info['conn'][0])]
 +            formats = [{
 +                'format_id': '_rtmp',
 +                'protocol': 'rtmp',
 +                'url': video_info['conn'][0],
 +                'player_url': player_url,
 +            }]
          elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
              if 'rtmpe%3Dyes' in encoded_url_map:
                      if 'ratebypass' not in url:
                          url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
 -            video_url_list = self._get_video_url_list(url_map)
 +            formats = _map_to_format_list(url_map)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
              url_map = self._extract_from_m3u8(manifest_url, video_id)
 -            video_url_list = self._get_video_url_list(url_map)
 +            formats = _map_to_format_list(url_map)
          else:
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
 -        formats = []
 -        for itag, video_real_url in video_url_list:
 -            dct = {
 -                'format_id': itag,
 -                'url': video_real_url,
 -                'player_url': player_url,
 -            }
 -            dct.update(self._formats[itag])
 -            formats.append(dct)
 +        # Look for the DASH manifest
 +        dash_manifest_url_lst = video_info.get('dashmpd')
 +        if dash_manifest_url_lst and dash_manifest_url_lst[0]:
 +            try:
 +                dash_doc = self._download_xml(
 +                    dash_manifest_url_lst[0], video_id,
 +                    note=u'Downloading DASH manifest',
 +                    errnote=u'Could not download DASH manifest')
 +                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 +                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 +                    if url_el is None:
 +                        continue
 +                    format_id = r.attrib['id']
 +                    video_url = url_el.text
 +                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 +                    f = {
 +                        'format_id': format_id,
 +                        'url': video_url,
 +                        'width': int_or_none(r.attrib.get('width')),
 +                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 +                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 +                        'filesize': filesize,
 +                    }
 +                    try:
 +                        existing_format = next(
 +                            fo for fo in formats
 +                            if fo['format_id'] == format_id)
 +                    except StopIteration:
 +                        f.update(self._formats.get(format_id, {}))
 +                        formats.append(f)
 +                    else:
 +                        existing_format.update(f)
 +
 +            except (ExtractorError, KeyError) as e:
 +                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
  
          self._sort_formats(formats)
  
@@@ -1626,44 -1581,35 +1627,35 @@@ class YoutubeUserIE(InfoExtractor)
          # page by page until there are no video ids - it means we got
          # all of them.
  
-         url_results = []
-         for pagenum in itertools.count(0):
+         def download_page(pagenum):
              start_index = pagenum * self._GDATA_PAGE_SIZE + 1
  
              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-             page = self._download_webpage(gdata_url, username,
-                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+             page = self._download_webpage(
+                 gdata_url, username,
+                 u'Downloading video ids from %d to %d' % (
+                     start_index, start_index + self._GDATA_PAGE_SIZE))
  
              try:
                  response = json.loads(page)
              except ValueError as err:
                  raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
              if 'entry' not in response['feed']:
-                 # Number of videos is a multiple of self._MAX_RESULTS
-                 break
+                 return
  
              # Extract video identifiers
              entries = response['feed']['entry']
              for entry in entries:
                  title = entry['title']['$t']
                  video_id = entry['id']['$t'].split('/')[-1]
-                 url_results.append({
+                 yield {
                      '_type': 'url',
                      'url': video_id,
                      'ie_key': 'Youtube',
                      'id': 'video_id',
                      'title': title,
-                 })
-             # A little optimization - if current page is not
-             # "full", ie. does not contain PAGE_SIZE video ids then
-             # we can assume that this page is the last one - there
-             # are no more ids on further pages - no need to query
-             # again.
-             if len(entries) < self._GDATA_PAGE_SIZE:
-                 break
+                 }
+         url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
  
          return self.playlist_result(url_results, playlist_title=username)
  
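The YoutubeUserIE change above turns the eager page loop into a generator that PagedList invokes on demand, so pages beyond the requested slice are never downloaded. A minimal sketch of the pattern (fetch_ids is a hypothetical stand-in for the GData request):

    from youtube_dl.utils import PagedList

    PAGE_SIZE = 50

    def download_page(pagenum):
        # fetch_ids is hypothetical: it returns the video ids of one page
        for video_id in fetch_ids(pagenum * PAGE_SIZE, PAGE_SIZE):
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    entries = PagedList(download_page, PAGE_SIZE)
    entries.getslice(0, 10)  # only the first page is ever requested
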
diff --combined youtube_dl/utils.py
index 3d29039867efa2b21b005a035c9cbf5c43bb5d71,ff124d9e8cbd42d04904e527750b7c3d94e74f59..ed5ee222f5ccdf75563681266777c8e9c132df80
@@@ -6,6 -6,7 +6,7 @@@ import datetim
  import email.utils
  import errno
  import gzip
+ import itertools
  import io
  import json
  import locale
@@@ -224,7 -225,7 +225,7 @@@ if sys.version_info >= (2,7)
      def find_xpath_attr(node, xpath, key, val):
          """ Find the xpath xpath[@key=val] """
          assert re.match(r'^[a-zA-Z]+$', key)
 -        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 +        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
          expr = xpath + u"[@%s='%s']" % (key, val)
          return node.find(expr)
  else:
@@@ -1092,12 -1093,9 +1093,12 @@@ def month_by_name(name)
          return None
  
  
 -def fix_xml_all_ampersand(xml_str):
 +def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
 -    return xml_str.replace(u'&', u'&amp;')
 +    return re.sub(
 +        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
 +        u'&amp;',
 +        xml_str)
  
  
  def setproctitle(title):
@@@ -1131,8 -1129,8 +1132,8 @@@ class HEADRequest(compat_urllib_request
          return "HEAD"
  
  
 -def int_or_none(v):
 -    return v if v is None else int(v)
 +def int_or_none(v, scale=1):
 +    return v if v is None else (int(v) // scale)
  
  
  def parse_duration(s):
@@@ -1164,3 -1162,46 +1165,46 @@@ def check_executable(exe, args=[])
      except OSError:
          return False
      return exe
+
+
+ class PagedList(object):
+     def __init__(self, pagefunc, pagesize):
+         self._pagefunc = pagefunc
+         self._pagesize = pagesize
+
+     def getslice(self, start=0, end=None):
+         res = []
+         for pagenum in itertools.count(start // self._pagesize):
+             firstid = pagenum * self._pagesize
+             nextfirstid = pagenum * self._pagesize + self._pagesize
+             if start >= nextfirstid:
+                 continue
+             page_results = list(self._pagefunc(pagenum))
+             startv = (
+                 start % self._pagesize
+                 if firstid <= start < nextfirstid
+                 else 0)
+             endv = (
+                 ((end - 1) % self._pagesize) + 1
+                 if (end is not None and firstid <= end <= nextfirstid)
+                 else None)
+             if startv != 0 or endv is not None:
+                 page_results = page_results[startv:endv]
+             res.extend(page_results)
+             # A little optimization: if the current page is not "full",
+             # i.e. contains fewer than pagesize videos, it must be the
+             # last one, so there are no more ids on further pages and no
+             # need to query again.
+             if len(page_results) + startv < self._pagesize:
+                 break
+             # If we got the whole page, but the next page is not interesting,
+             # break out early as well
+             if end == nextfirstid:
+                 break
+         return res
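
Taken together, getslice skips whole pages before the requested range, trims the first and last overlapping pages, and stops as soon as the range is exhausted. A small worked example mirroring the test_utils.py semantics above (five items split into pages of two):

    from youtube_dl.utils import PagedList

    def get_page(pagenum):
        return iter(range(pagenum * 2, min(5, pagenum * 2 + 2)))

    pl = PagedList(get_page, 2)
    print(pl.getslice(1, 4))  # [1, 2, 3]; page 2 is never fetched because
                              # the loop breaks once end == nextfirstid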