Merge branch 'paged-lists'
author    Philipp Hagemeister <phihag@phihag.de>
          Wed, 22 Jan 2014 19:00:16 +0000 (20:00 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
          Wed, 22 Jan 2014 19:00:16 +0000 (20:00 +0100)
Conflicts:
test/test_utils.py
youtube_dl/extractor/youtube.py

test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py

diff --combined test/test_utils.py
index a17483ada829345e8e96f23dbdeaeea7a5451294,349c1107f4c123fd043682428b36def8690708cb..c68e0e96844213f6626c950bd89487b74223593e
@@@ -16,9 -16,9 +16,10 @@@ from youtube_dl.utils import 
      DateRange,
      encodeFilename,
      find_xpath_attr,
 +    fix_xml_ampersands,
      get_meta_content,
      orderedSet,
+     PagedList,
      parse_duration,
      sanitize_filename,
      shell_quote,
@@@ -201,18 -201,26 +202,39 @@@ class TestUtil(unittest.TestCase)
          self.assertEqual(parse_duration('9:12:43'), 33163)
          self.assertEqual(parse_duration('x:y'), None)
  
 +    def test_fix_xml_ampersands(self):
 +        self.assertEqual(
 +            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
 +        self.assertEqual(
 +            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
 +            '"&amp;x=y&amp;wrong;&amp;z=a')
 +        self.assertEqual(
 +            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
 +            '&amp;&apos;&gt;&lt;&quot;')
 +        self.assertEqual(
 +            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
 +        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
 +
+     def test_paged_list(self):
+         def testPL(size, pagesize, sliceargs, expected):
+             def get_page(pagenum):
+                 firstid = pagenum * pagesize
+                 upto = min(size, pagenum * pagesize + pagesize)
+                 for i in range(firstid, upto):
+                     yield i
+             pl = PagedList(get_page, pagesize)
+             got = pl.getslice(*sliceargs)
+             self.assertEqual(got, expected)
+         testPL(5, 2, (), [0, 1, 2, 3, 4])
+         testPL(5, 2, (1,), [1, 2, 3, 4])
+         testPL(5, 2, (2,), [2, 3, 4])
+         testPL(5, 2, (4,), [4])
+         testPL(5, 2, (0, 3), [0, 1, 2])
+         testPL(5, 2, (1, 4), [1, 2, 3])
+         testPL(5, 2, (2, 99), [2, 3, 4])
+         testPL(5, 2, (20, 99), [])


  if __name__ == '__main__':
      unittest.main()
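
A minimal sketch (not part of the diff) of the escaping rule the new tests pin down: fix_xml_ampersands is expected to touch only ampersands that do not already start an entity or a numeric character reference.

    from youtube_dl.utils import fix_xml_ampersands

    # Bare '&' is escaped; '&amp;' and numeric references such as
    # '&#65;' or '&#x41;' are already valid and stay untouched.
    assert fix_xml_ampersands('a & b &amp; c &#65;') == 'a &amp; b &amp; c &#65;'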
diff --combined youtube_dl/YoutubeDL.py
index 87e7d21d69ef93f2183cce8119f47f5069ba5ea7,2ad6f10286784585865e1445d223f39a2e5d31d7..a48e8ba23a34608072852dcb965640c55e5c37fc
@@@ -39,6 -39,7 +39,7 @@@ from .utils import 
      locked_file,
      make_HTTPS_handler,
      MaxDownloadsReached,
+     PagedList,
      PostProcessingError,
      platform_name,
      preferredencoding,
@@@ -151,9 -152,6 +152,9 @@@ class YoutubeDL(object)
      bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fribidi
      debug_printtraffic:Print out sent and received HTTP traffic
 +    include_ads:       Download ads as well
 +    default_search:    Prepend this string if an input URL is not valid.
 +                       'auto' for elaborate guessing
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
  
              playlist_results = []
  
-             n_all_entries = len(ie_result['entries'])
              playliststart = self.params.get('playliststart', 1) - 1
              playlistend = self.params.get('playlistend', None)
              # For backwards compatibility, interpret -1 as whole list
              if playlistend == -1:
                  playlistend = None
  
-             entries = ie_result['entries'][playliststart:playlistend]
-             n_entries = len(entries)
-             self.to_screen(
-                 "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
+             if isinstance(ie_result['entries'], list):
+                 n_all_entries = len(ie_result['entries'])
+                 entries = ie_result['entries'][playliststart:playlistend]
+                 n_entries = len(entries)
+                 self.to_screen(
+                     "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
+             else:
+                 assert isinstance(ie_result['entries'], PagedList)
+                 entries = ie_result['entries'].getslice(
+                     playliststart, playlistend)
+                 n_entries = len(entries)
+                 self.to_screen(
+                     "[%s] playlist %s: Downloading %d videos" %
+                     (ie_result['extractor'], playlist, n_entries))
  
              for i, entry in enumerate(entries, 1):
                  self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
              return available_formats[-1]
          elif format_spec == 'worst':
              return available_formats[0]
 +        elif format_spec == 'bestaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[-1]
 +        elif format_spec == 'worstaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[0]
          else:
              extensions = ['mp4', 'flv', 'webm', '3gp']
              if format_spec in extensions:
              self.list_formats(info_dict)
              return
  
 -        req_format = self.params.get('format', 'best')
 +        req_format = self.params.get('format')
          if req_format is None:
              req_format = 'best'
          formats_to_download = []
                  res += 'audio'
              if fdict.get('abr') is not None:
                  res += '@%3dk' % fdict['abr']
 +            if fdict.get('asr') is not None:
 +                res += ' (%5dHz)' % fdict['asr']
              if fdict.get('filesize') is not None:
                  if res:
                      res += ', '
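
The new 'bestaudio' and 'worstaudio' branches above rely on available_formats being sorted worst-to-best, so [-1] and [0] pick the best and worst audio-only entry. A minimal sketch with made-up format dicts (the entries are hypothetical, not real extractor output):

    available_formats = [  # assumed sorted worst-to-best
        {'format_id': '140', 'vcodec': 'none', 'abr': 128},
        {'format_id': '18', 'vcodec': 'h264', 'abr': 96},
        {'format_id': '141', 'vcodec': 'none', 'abr': 256},
    ]
    audio_formats = [f for f in available_formats if f.get('vcodec') == 'none']
    best_audio = audio_formats[-1] if audio_formats else None   # format 141
    worst_audio = audio_formats[0] if audio_formats else None   # format 140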
diff --combined youtube_dl/extractor/youtube.py
index 870b7c4cabb502f870b7f1aeee9b5295156b3e29,dd1a58f3fc0f5448fdf1266ef0b51840a6307d8e..57b8fdff7204b1fbb5dc6d88742704638995b0eb
@@@ -27,7 -27,7 +27,8 @@@ from ..utils import 
      get_element_by_id,
      get_element_by_attribute,
      ExtractorError,
 +    int_or_none,
+     PagedList,
      RegexNotFoundError,
      unescapeHTML,
      unified_strdate,
@@@ -271,21 -271,6 +272,21 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                  u"uploader_id": u"setindia"
              }
          },
 +        {
 +            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 +            u"file": u"a9LDPn-MO4I.m4a",
 +            u"note": u"256k DASH audio (format 141) via DASH manifest",
 +            u"params": {
 +                u"format": "141"
 +            },
 +            u"info_dict": {
 +                u"upload_date": "20121002",
 +                u"uploader_id": "8KVIDEO",
 +                u"description": "No description available.",
 +                u"uploader": "8KVIDEO",
 +                u"title": "UHDTV TEST 8K VIDEO.mp4"
 +            }
 +        },
      ]
  
  
          video_id = mobj.group(2)
          return video_id
  
 -    def _get_video_url_list(self, url_map):
 -        """
 -        Transform a dictionary in the format {itag:url} to a list of (itag, url)
 -        with the requested formats.
 -        """
 -        existing_formats = [x for x in self._formats if x in url_map]
 -        if len(existing_formats) == 0:
 -            raise ExtractorError(u'no known formats available for video')
 -        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 -        video_url_list.reverse() # order worst to best
 -        return video_url_list
 -
      def _extract_from_m3u8(self, manifest_url, video_id):
          url_map = {}
          def _get_urls(_manifest):
                  video_annotations = self._extract_annotations(video_id)
  
          # Decide which formats to download
 -
          try:
              mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
              if not mobj:
          except ValueError:
              pass
  
 +        def _map_to_format_list(urlmap):
 +            formats = []
 +            for itag, video_real_url in urlmap.items():
 +                dct = {
 +                    'format_id': itag,
 +                    'url': video_real_url,
 +                    'player_url': player_url,
 +                }
 +                dct.update(self._formats[itag])
 +                formats.append(dct)
 +            return formats
 +
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
 -            video_url_list = [('_rtmp', video_info['conn'][0])]
 +            formats = [{
 +                'format_id': '_rtmp',
 +                'protocol': 'rtmp',
 +                'url': video_info['conn'][0],
 +                'player_url': player_url,
 +            }]
          elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
              if 'rtmpe%3Dyes' in encoded_url_map:
                      if 'ratebypass' not in url:
                          url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
 -            video_url_list = self._get_video_url_list(url_map)
 +            formats = _map_to_format_list(url_map)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
              url_map = self._extract_from_m3u8(manifest_url, video_id)
 -            video_url_list = self._get_video_url_list(url_map)
 +            formats = _map_to_format_list(url_map)
          else:
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
 -        formats = []
 -        for itag, video_real_url in video_url_list:
 -            dct = {
 -                'format_id': itag,
 -                'url': video_real_url,
 -                'player_url': player_url,
 -            }
 -            dct.update(self._formats[itag])
 -            formats.append(dct)
 +        # Look for the DASH manifest
 +        dash_manifest_url_lst = video_info.get('dashmpd')
 +        if dash_manifest_url_lst and dash_manifest_url_lst[0]:
 +            try:
 +                dash_doc = self._download_xml(
 +                    dash_manifest_url_lst[0], video_id,
 +                    note=u'Downloading DASH manifest',
 +                    errnote=u'Could not download DASH manifest')
 +                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 +                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 +                    if url_el is None:
 +                        continue
 +                    format_id = r.attrib['id']
 +                    video_url = url_el.text
 +                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 +                    f = {
 +                        'format_id': format_id,
 +                        'url': video_url,
 +                        'width': int_or_none(r.attrib.get('width')),
 +                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 +                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 +                        'filesize': filesize,
 +                    }
 +                    try:
 +                        existing_format = next(
 +                            fo for fo in formats
 +                            if fo['format_id'] == format_id)
 +                    except StopIteration:
 +                        f.update(self._formats.get(format_id, {}))
 +                        formats.append(f)
 +                    else:
 +                        existing_format.update(f)
 +
 +            except (ExtractorError, KeyError) as e:
 +                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
  
          self._sort_formats(formats)
  
@@@ -1626,44 -1581,35 +1627,35 @@@ class YoutubeUserIE(InfoExtractor)
          # page by page until there are no video ids - it means we got
          # all of them.
  
-         url_results = []
-         for pagenum in itertools.count(0):
+         def download_page(pagenum):
              start_index = pagenum * self._GDATA_PAGE_SIZE + 1
  
              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-             page = self._download_webpage(gdata_url, username,
-                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+             page = self._download_webpage(
+                 gdata_url, username,
+                 u'Downloading video ids from %d to %d' % (
+                     start_index, start_index + self._GDATA_PAGE_SIZE))
  
              try:
                  response = json.loads(page)
              except ValueError as err:
                  raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
              if 'entry' not in response['feed']:
-                 # Number of videos is a multiple of self._MAX_RESULTS
-                 break
+                 return
  
              # Extract video identifiers
              entries = response['feed']['entry']
              for entry in entries:
                  title = entry['title']['$t']
                  video_id = entry['id']['$t'].split('/')[-1]
-                 url_results.append({
+                 yield {
                      '_type': 'url',
                      'url': video_id,
                      'ie_key': 'Youtube',
                      'id': 'video_id',
                      'title': title,
-                 })
-             # A little optimization - if current page is not
-             # "full", ie. does not contain PAGE_SIZE video ids then
-             # we can assume that this page is the last one - there
-             # are no more ids on further pages - no need to query
-             # again.
-             if len(entries) < self._GDATA_PAGE_SIZE:
-                 break
+                 }
+         url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
  
          return self.playlist_result(url_results, playlist_title=username)
  
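The YoutubeUserIE change above turns the eager page loop into a generator that PagedList invokes on demand, so pages beyond the requested slice are never downloaded. A minimal sketch of the pattern (fetch_ids is a hypothetical stand-in for the GData request):

    from youtube_dl.utils import PagedList

    PAGE_SIZE = 50

    def download_page(pagenum):
        # fetch_ids is hypothetical: it returns the video ids of one page
        for video_id in fetch_ids(pagenum * PAGE_SIZE, PAGE_SIZE):
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    entries = PagedList(download_page, PAGE_SIZE)
    entries.getslice(0, 10)  # only the first page is ever requested
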
diff --combined youtube_dl/utils.py
index 3d29039867efa2b21b005a035c9cbf5c43bb5d71,ff124d9e8cbd42d04904e527750b7c3d94e74f59..ed5ee222f5ccdf75563681266777c8e9c132df80
@@@ -6,6 -6,7 +6,7 @@@ import datetim
  import email.utils
  import errno
  import gzip
+ import itertools
  import io
  import json
  import locale
@@@ -224,7 -225,7 +225,7 @@@ if sys.version_info >= (2,7)
      def find_xpath_attr(node, xpath, key, val):
          """ Find the xpath xpath[@key=val] """
          assert re.match(r'^[a-zA-Z]+$', key)
 -        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 +        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
          expr = xpath + u"[@%s='%s']" % (key, val)
          return node.find(expr)
  else:
@@@ -1092,12 -1093,9 +1093,12 @@@ def month_by_name(name)
          return None
  
  
 -def fix_xml_all_ampersand(xml_str):
 +def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
 -    return xml_str.replace(u'&', u'&amp;')
 +    return re.sub(
 +        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
 +        u'&amp;',
 +        xml_str)
  
  
  def setproctitle(title):
@@@ -1131,8 -1129,8 +1132,8 @@@ class HEADRequest(compat_urllib_request
          return "HEAD"
  
  
 -def int_or_none(v):
 -    return v if v is None else int(v)
 +def int_or_none(v, scale=1):
 +    return v if v is None else (int(v) // scale)
  
  
  def parse_duration(s):
@@@ -1164,3 -1162,46 +1165,46 @@@ def check_executable(exe, args=[])
      except OSError:
          return False
      return exe
+
+
+ class PagedList(object):
+     def __init__(self, pagefunc, pagesize):
+         self._pagefunc = pagefunc
+         self._pagesize = pagesize
+
+     def getslice(self, start=0, end=None):
+         res = []
+         for pagenum in itertools.count(start // self._pagesize):
+             firstid = pagenum * self._pagesize
+             nextfirstid = pagenum * self._pagesize + self._pagesize
+             if start >= nextfirstid:
+                 continue
+             page_results = list(self._pagefunc(pagenum))
+             startv = (
+                 start % self._pagesize
+                 if firstid <= start < nextfirstid
+                 else 0)
+             endv = (
+                 ((end - 1) % self._pagesize) + 1
+                 if (end is not None and firstid <= end <= nextfirstid)
+                 else None)
+             if startv != 0 or endv is not None:
+                 page_results = page_results[startv:endv]
+             res.extend(page_results)
+             # A little optimization: if the current page is not "full",
+             # i.e. contains fewer than pagesize videos, it must be the
+             # last one, so there are no more ids on further pages and no
+             # need to query again.
+             if len(page_results) + startv < self._pagesize:
+                 break
+             # If we got the whole page, but the next page is not interesting,
+             # break out early as well
+             if end == nextfirstid:
+                 break
+         return res
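
Taken together, getslice skips whole pages before the requested range, trims the first and last overlapping pages, and stops as soon as the range is exhausted. A small worked example mirroring the test_utils.py semantics above (five items split into pages of two):

    from youtube_dl.utils import PagedList

    def get_page(pagenum):
        return iter(range(pagenum * 2, min(5, pagenum * 2 + 2)))

    pl = PagedList(get_page, 2)
    print(pl.getslice(1, 4))  # [1, 2, 3]; page 2 is never fetched because
                              # the loop breaks once end == nextfirstid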