Merge branch 'youtube-dash-manifest'

author Philipp Hagemeister <phihag@phihag.de>

Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
diff --combined youtube_dl/YoutubeDL.py

index 1491f8908a55ee973acc2b6d4e9820a1d55281b6,11f88f1280c2d82bea2fd5fbe1f5af2bd2459d1e..87e7d21d69ef93f2183cce8119f47f5069ba5ea7
--- 1/youtube_dl/YoutubeDL.py
--- 2/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@@ -151,9 -151,6 +151,9 @@@ class YoutubeDL(object)
       bidi_workaround:   Work around buggy terminals without bidirectional text
                          support, using fribidi
       debug_printtraffic:Print out sent and received HTTP traffic
+ +    include_ads:       Download ads as well
+ +    default_search:    Prepend this string if an input url is not valid.
+ +                       'auto' for elaborate guessing
   
       The following parameters are not used by YoutubeDL itself, they are used by
       the FileDownloader:
@@@ -637,18 -634,6 +637,18 @@@
               return available_formats[-1]
           elif format_spec == 'worst':
               return available_formats[0]
+ +        elif format_spec == 'bestaudio':
+ +            audio_formats = [
+ +                f for f in available_formats
+ +                if f.get('vcodec') == 'none']
+ +            if audio_formats:
+ +                return audio_formats[-1]
+ +        elif format_spec == 'worstaudio':
+ +            audio_formats = [
+ +                f for f in available_formats
+ +                if f.get('vcodec') == 'none']
+ +            if audio_formats:
+ +                return audio_formats[0]
           else:
               extensions = ['mp4', 'flv', 'webm', '3gp']
               if format_spec in extensions:
@@@ -713,7 -698,7 +713,7 @@@
               self.list_formats(info_dict)
               return
   
- -        req_format = self.params.get('format', 'best')
+ +        req_format = self.params.get('format')
           if req_format is None:
               req_format = 'best'
           formats_to_download = []
@@@ -1106,6 -1091,8 +1106,8 @@@
                   res += 'audio'
               if fdict.get('abr') is not None:
                   res += '@%3dk' % fdict['abr']
+             if fdict.get('asr') is not None:
+                 res += ' (%5dHz)' % fdict['asr']
               if fdict.get('filesize') is not None:
                   if res:
                       res += ', '
diff --combined youtube_dl/extractor/common.py

index 582eb4f5be4f8889994ba0d3e7abc186c9c5547e,56c54a5ce2627ecc9d488fbe12901c689fefeb3b..02a82dc57cf5b0d07586d39614ae9f03acd81b5e
--- 1/youtube_dl/extractor/common.py
--- 2/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@@ -63,6 -63,7 +63,7 @@@ class InfoExtractor(object)
                       * tbr        Average bitrate of audio and video in KBit/s
                       * abr        Average audio bitrate in KBit/s
                       * acodec     Name of the audio codec in use
+                     * asr        Audio sampling rate in Hertz
                       * vbr        Average video bitrate in KBit/s
                       * vcodec     Name of the video codec in use
                       * filesize   The number of bytes, if known in advance
@@@ -220,8 -221,6 +221,8 @@@
                             webpage_bytes[:1024])
               if m:
                   encoding = m.group(1).decode('ascii')
+ +            elif webpage_bytes.startswith(b'\xff\xfe'):
+ +                encoding = 'utf-16'
               else:
                   encoding = 'utf-8'
           if self._downloader.params.get('dump_intermediate_pages', False):
@@@ -238,7 -237,7 +239,7 @@@
               except AttributeError:
                   url = url_or_request
               if len(url) > 200:
- -                h = hashlib.md5(url).hexdigest()
+ +                h = u'___' + hashlib.md5(url).hexdigest()
                   url = url[:200 - len(h)] + h
               raw_filename = ('%s_%s.dump' % (video_id, url))
               filename = sanitize_filename(raw_filename, restricted=True)
diff --combined youtube_dl/extractor/youtube.py

index 248b30ffb329d3870e0249e4d775c3f969849ce8,b943f19f9f14ebe372fff0adf280633246293a9d..870b7c4cabb502f870b7f1aeee9b5295156b3e29
--- 1/youtube_dl/extractor/youtube.py
--- 2/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@@ -27,7 -27,7 +27,8 @@@ from ..utils import 
       get_element_by_id,
       get_element_by_attribute,
       ExtractorError,
+     int_or_none,
+ +    RegexNotFoundError,
       unescapeHTML,
       unified_strdate,
       orderedSet,
@@@ -270,6 -270,21 +271,21 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                   u"uploader_id": u"setindia"
               }
           },
+         {
+             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
+             u"file": u"a9LDPn-MO4I.m4a",
+             u"note": u"256k DASH audio (format 141) via DASH manifest",
+             u"params": {
+                 u"format": "141"
+             },
+             u"info_dict": {
+                 u"upload_date": "20121002",
+                 u"uploader_id": "8KVIDEO",
+                 u"description": "No description available.",
+                 u"uploader": "8KVIDEO",
+                 u"title": "UHDTV TEST 8K VIDEO.mp4"
+             }
+         },
       ]
   
   
@@@ -1067,18 -1082,6 +1083,6 @@@
           video_id = mobj.group(2)
           return video_id
   
-     def _get_video_url_list(self, url_map):
-         """
-         Transform a dictionary in the format {itag:url} to a list of (itag, url)
-         with the requested formats.
-         """
-         existing_formats = [x for x in self._formats if x in url_map]
-         if len(existing_formats) == 0:
-             raise ExtractorError(u'no known formats available for video')
-         video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
-         video_url_list.reverse() # order worst to best
-         return video_url_list
- 
       def _extract_from_m3u8(self, manifest_url, video_id):
           url_map = {}
           def _get_urls(_manifest):
@@@ -1252,7 -1255,6 +1256,6 @@@
                   video_annotations = self._extract_annotations(video_id)
   
           # Decide which formats to download
- 
           try:
               mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
               if not mobj:
@@@ -1277,9 -1279,26 +1280,26 @@@
           except ValueError:
               pass
   
+         def _map_to_format_list(urlmap):
+             formats = []
+             for itag, video_real_url in urlmap.items():
+                 dct = {
+                     'format_id': itag,
+                     'url': video_real_url,
+                     'player_url': player_url,
+                 }
+                 dct.update(self._formats[itag])
+                 formats.append(dct)
+             return formats
+ 
           if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
               self.report_rtmp_download()
-             video_url_list = [('_rtmp', video_info['conn'][0])]
+             formats = [{
+                 'format_id': '_rtmp',
+                 'protocol': 'rtmp',
+                 'url': video_info['conn'][0],
+                 'player_url': player_url,
+             }]
           elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
               encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
               if 'rtmpe%3Dyes' in encoded_url_map:
@@@ -1324,23 -1343,49 +1344,49 @@@
                       if 'ratebypass' not in url:
                           url += '&ratebypass=yes'
                       url_map[url_data['itag'][0]] = url
-             video_url_list = self._get_video_url_list(url_map)
+             formats = _map_to_format_list(url_map)
           elif video_info.get('hlsvp'):
               manifest_url = video_info['hlsvp'][0]
               url_map = self._extract_from_m3u8(manifest_url, video_id)
-             video_url_list = self._get_video_url_list(url_map)
+             formats = _map_to_format_list(url_map)
           else:
               raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
   
-         formats = []
-         for itag, video_real_url in video_url_list:
-             dct = {
-                 'format_id': itag,
-                 'url': video_real_url,
-                 'player_url': player_url,
-             }
-             dct.update(self._formats[itag])
-             formats.append(dct)
+         # Look for the DASH manifest
+         dash_manifest_url_lst = video_info.get('dashmpd')
+         if dash_manifest_url_lst and dash_manifest_url_lst[0]:
+             try:
+                 dash_doc = self._download_xml(
+                     dash_manifest_url_lst[0], video_id,
+                     note=u'Downloading DASH manifest',
+                     errnote=u'Could not download DASH manifest')
+                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+                     if url_el is None:
+                         continue
+                     format_id = r.attrib['id']
+                     video_url = url_el.text
+                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+                     f = {
+                         'format_id': format_id,
+                         'url': video_url,
+                         'width': int_or_none(r.attrib.get('width')),
+                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+                         'filesize': filesize,
+                     }
+                     try:
+                         existing_format = next(
+                             fo for fo in formats
+                             if fo['format_id'] == format_id)
+                     except StopIteration:
+                         f.update(self._formats.get(format_id, {}))
+                         formats.append(f)
+                     else:
+                         existing_format.update(f)
+ 
+             except (ExtractorError, KeyError) as e:
+                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
   
           self._sort_formats(formats)
   
@@@ -1449,14 -1494,7 +1495,14 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
               if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                   break
   
- -        playlist_title = self._og_search_title(page)
+ +        try:
+ +            playlist_title = self._og_search_title(page)
+ +        except RegexNotFoundError:
+ +            self.report_warning(
+ +                u'Playlist page is missing OpenGraph title, falling back ...',
+ +                playlist_id)
+ +            playlist_title = self._html_search_regex(
+ +                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
   
           url_results = self._ids_to_results(ids)
           return self.playlist_result(url_results, playlist_id, playlist_title)
diff --combined youtube_dl/utils.py

index 6c00973bd34dd266ad2b9cf903e6caa9b8e83033,879394d881408bc37fd504cf27a7b67d177e98f2..3d29039867efa2b21b005a035c9cbf5c43bb5d71
--- 1/youtube_dl/utils.py
--- 2/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@@ -224,7 -224,7 +224,7 @@@ if sys.version_info >= (2,7)
       def find_xpath_attr(node, xpath, key, val):
           """ Find the xpath xpath[@key=val] """
           assert re.match(r'^[a-zA-Z]+$', key)
- -        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
+ +        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
           expr = xpath + u"[@%s='%s']" % (key, val)
           return node.find(expr)
   else:
@@@ -1092,12 -1092,9 +1092,12 @@@ def month_by_name(name)
           return None
   
   
- -def fix_xml_all_ampersand(xml_str):
+ +def fix_xml_ampersands(xml_str):
       """Replace every '&' that does not already start an XML entity by '&amp;'"""
- -    return xml_str.replace(u'&', u'&amp;')
+ +    return re.sub(
+ +        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+ +        u'&amp;',
+ +        xml_str)
   
   
   def setproctitle(title):
@@@ -1131,8 -1128,8 +1131,8 @@@ class HEADRequest(compat_urllib_request
           return "HEAD"
   
   
- def int_or_none(v):
-     return v if v is None else int(v)
+ def int_or_none(v, scale=1):
+     return v if v is None else (int(v) // scale)
   
   
   def parse_duration(s):
author	Philipp Hagemeister <phihag@phihag.de>
	Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Wed, 22 Jan 2014 18:58:31 +0000 (19:58 +0100)
		1	2
youtube_dl/YoutubeDL.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/common.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/youtube.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/utils.py	patch \|	diff1 \|	diff2 \|	blob \| history