Merge branch 'youtube-dash-manifest'
author Philipp Hagemeister <phihag@phihag.de>
Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
Conflicts:
youtube_dl/extractor/youtube.py

1  2 
youtube_dl/YoutubeDL.py
youtube_dl/extractor/common.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py

diff --combined youtube_dl/YoutubeDL.py
index 1491f8908a55ee973acc2b6d4e9820a1d55281b6,11f88f1280c2d82bea2fd5fbe1f5af2bd2459d1e..87e7d21d69ef93f2183cce8119f47f5069ba5ea7
@@@ -151,9 -151,6 +151,9 @@@ class YoutubeDL(object)
      bidi_workaround:   Work around buggy terminals without bidirectional text
                         support, using fridibi
      debug_printtraffic:Print out sent and received HTTP traffic
 +    include_ads:       Download ads as well
 +    default_search:    Prepend this string if an input url is not valid.
 +                       'auto' for elaborate guessing
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
              return available_formats[-1]
          elif format_spec == 'worst':
              return available_formats[0]
 +        elif format_spec == 'bestaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[-1]
 +        elif format_spec == 'worstaudio':
 +            audio_formats = [
 +                f for f in available_formats
 +                if f.get('vcodec') == 'none']
 +            if audio_formats:
 +                return audio_formats[0]
          else:
              extensions = ['mp4', 'flv', 'webm', '3gp']
              if format_spec in extensions:
              self.list_formats(info_dict)
              return
  
 -        req_format = self.params.get('format', 'best')
 +        req_format = self.params.get('format')
          if req_format is None:
              req_format = 'best'
          formats_to_download = []
                  res += 'audio'
              if fdict.get('abr') is not None:
                  res += '@%3dk' % fdict['abr']
+             if fdict.get('asr') is not None:
+                 res += ' (%5dHz)' % fdict['asr']
              if fdict.get('filesize') is not None:
                  if res:
                      res += ', '
diff --combined youtube_dl/extractor/common.py
index 582eb4f5be4f8889994ba0d3e7abc186c9c5547e,56c54a5ce2627ecc9d488fbe12901c689fefeb3b..02a82dc57cf5b0d07586d39614ae9f03acd81b5e
@@@ -63,6 -63,7 +63,7 @@@ class InfoExtractor(object)
                      * tbr        Average bitrate of audio and video in KBit/s
                      * abr        Average audio bitrate in KBit/s
                      * acodec     Name of the audio codec in use
+                     * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
                      * vcodec     Name of the video codec in use
                      * filesize   The number of bytes, if known in advance
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')
 +            elif webpage_bytes.startswith(b'\xff\xfe'):
 +                encoding = 'utf-16'
              else:
                  encoding = 'utf-8'
          if self._downloader.params.get('dump_intermediate_pages', False):
              except AttributeError:
                  url = url_or_request
              if len(url) > 200:
 -                h = hashlib.md5(url).hexdigest()
 +                h = u'___' + hashlib.md5(url).hexdigest()
                  url = url[:200 - len(h)] + h
              raw_filename = ('%s_%s.dump' % (video_id, url))
              filename = sanitize_filename(raw_filename, restricted=True)
diff --combined youtube_dl/extractor/youtube.py
index 248b30ffb329d3870e0249e4d775c3f969849ce8,b943f19f9f14ebe372fff0adf280633246293a9d..870b7c4cabb502f870b7f1aeee9b5295156b3e29
@@@ -27,7 -27,7 +27,8 @@@ from ..utils import 
      get_element_by_id,
      get_element_by_attribute,
      ExtractorError,
+     int_or_none,
 +    RegexNotFoundError,
      unescapeHTML,
      unified_strdate,
      orderedSet,
@@@ -270,6 -270,21 +271,21 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                  u"uploader_id": u"setindia"
              }
          },
+         {
+             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
+             u"file": u"a9LDPn-MO4I.m4a",
+             u"note": u"256k DASH audio (format 141) via DASH manifest",
+             u"params": {
+                 u"format": "141"
+             },
+             u"info_dict": {
+                 u"upload_date": "20121002",
+                 u"uploader_id": "8KVIDEO",
+                 u"description": "No description available.",
+                 u"uploader": "8KVIDEO",
+                 u"title": "UHDTV TEST 8K VIDEO.mp4"
+             }
+         },
      ]
  
  
          video_id = mobj.group(2)
          return video_id
  
-     def _get_video_url_list(self, url_map):
-         """
-         Transform a dictionary in the format {itag:url} to a list of (itag, url)
-         with the requested formats.
-         """
-         existing_formats = [x for x in self._formats if x in url_map]
-         if len(existing_formats) == 0:
-             raise ExtractorError(u'no known formats available for video')
-         video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
-         video_url_list.reverse() # order worst to best
-         return video_url_list
      def _extract_from_m3u8(self, manifest_url, video_id):
          url_map = {}
          def _get_urls(_manifest):
                  video_annotations = self._extract_annotations(video_id)
  
          # Decide which formats to download
          try:
              mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
              if not mobj:
          except ValueError:
              pass
  
+         def _map_to_format_list(urlmap):
+             formats = []
+             for itag, video_real_url in urlmap.items():
+                 dct = {
+                     'format_id': itag,
+                     'url': video_real_url,
+                     'player_url': player_url,
+                 }
+                 dct.update(self._formats[itag])
+                 formats.append(dct)
+             return formats
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
-             video_url_list = [('_rtmp', video_info['conn'][0])]
+             formats = [{
+                 'format_id': '_rtmp',
+                 'protocol': 'rtmp',
+                 'url': video_info['conn'][0],
+                 'player_url': player_url,
+             }]
          elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
              if 'rtmpe%3Dyes' in encoded_url_map:
                      if 'ratebypass' not in url:
                          url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
-             video_url_list = self._get_video_url_list(url_map)
+             formats = _map_to_format_list(url_map)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
              url_map = self._extract_from_m3u8(manifest_url, video_id)
-             video_url_list = self._get_video_url_list(url_map)
+             formats = _map_to_format_list(url_map)
          else:
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
-         formats = []
-         for itag, video_real_url in video_url_list:
-             dct = {
-                 'format_id': itag,
-                 'url': video_real_url,
-                 'player_url': player_url,
-             }
-             dct.update(self._formats[itag])
-             formats.append(dct)
+         # Look for the DASH manifest
+         dash_manifest_url_lst = video_info.get('dashmpd')
+         if dash_manifest_url_lst and dash_manifest_url_lst[0]:
+             try:
+                 dash_doc = self._download_xml(
+                     dash_manifest_url_lst[0], video_id,
+                     note=u'Downloading DASH manifest',
+                     errnote=u'Could not download DASH manifest')
+                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+                     if url_el is None:
+                         continue
+                     format_id = r.attrib['id']
+                     video_url = url_el.text
+                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+                     f = {
+                         'format_id': format_id,
+                         'url': video_url,
+                         'width': int_or_none(r.attrib.get('width')),
+                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+                         'filesize': filesize,
+                     }
+                     try:
+                         existing_format = next(
+                             fo for fo in formats
+                             if fo['format_id'] == format_id)
+                     except StopIteration:
+                         f.update(self._formats.get(format_id, {}))
+                         formats.append(f)
+                     else:
+                         existing_format.update(f)
+             except (ExtractorError, KeyError) as e:
+                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
  
          self._sort_formats(formats)
  
@@@ -1449,14 -1494,7 +1495,14 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
              if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                  break
  
 -        playlist_title = self._og_search_title(page)
 +        try:
 +            playlist_title = self._og_search_title(page)
 +        except RegexNotFoundError:
 +            self.report_warning(
 +                u'Playlist page is missing OpenGraph title, falling back ...',
 +                playlist_id)
 +            playlist_title = self._html_search_regex(
 +                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
  
          url_results = self._ids_to_results(ids)
          return self.playlist_result(url_results, playlist_id, playlist_title)
diff --combined youtube_dl/utils.py
index 6c00973bd34dd266ad2b9cf903e6caa9b8e83033,879394d881408bc37fd504cf27a7b67d177e98f2..3d29039867efa2b21b005a035c9cbf5c43bb5d71
@@@ -224,7 -224,7 +224,7 @@@ if sys.version_info >= (2,7)
      def find_xpath_attr(node, xpath, key, val):
          """ Find the xpath xpath[@key=val] """
          assert re.match(r'^[a-zA-Z]+$', key)
 -        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 +        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
          expr = xpath + u"[@%s='%s']" % (key, val)
          return node.find(expr)
  else:
@@@ -1092,12 -1092,9 +1092,12 @@@ def month_by_name(name)
          return None
  
  
 -def fix_xml_all_ampersand(xml_str):
 +def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
 -    return xml_str.replace(u'&', u'&amp;')
 +    return re.sub(
 +        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
 +        u'&amp;',
 +        xml_str)
  
  
  def setproctitle(title):
@@@ -1131,8 -1128,8 +1131,8 @@@ class HEADRequest(compat_urllib_request
          return "HEAD"
  
  
- def int_or_none(v):
-     return v if v is None else int(v)
+ def int_or_none(v, scale=1):
+     return v if v is None else (int(v) // scale)
  
  
  def parse_duration(s):