X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=dbae75406233ae8d5d2db2acf2090e6522a91295;hb=9d5332518c51fb0df69b844f9258e61bd7ecd390;hp=16ae4b98ffe09c97f604981bf6c2ce9dc1e44e03;hpb=8b9848ac5678356757f67a412f7ed89a0f559be7;p=youtube-dl

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 16ae4b98f..dbae75406 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -15,6 +15,7 @@ import xml.etree.ElementTree
 from ..compat import (
     compat_cookiejar,
     compat_cookies,
+    compat_getpass,
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
@@ -38,6 +39,7 @@ from ..utils import (
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
+    unified_strdate,
     url_basename,
     xpath_text,
     xpath_with_ns,
@@ -151,6 +153,7 @@ class InfoExtractor(object):
     description:    Full video description.
     uploader:       Full name of the video uploader.
     creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
@@ -162,6 +165,7 @@ class InfoExtractor(object):
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
                         * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
     automatic_captions: Like 'subtitles', used by the YoutubeIE for
                     automatically generated captions
     duration:       Length of the video in seconds, as an integer.
@@ -509,6 +513,18 @@ class InfoExtractor(object):
         """Report attempt to log in."""
         self.to_screen('Logging in')
 
+    @staticmethod
+    def raise_login_required(msg='This video is only available for registered users'):
+        raise ExtractorError(
+            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+            expected=True)
+
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
     # Methods for following #608
     @staticmethod
     def url_result(url, ie=None, video_id=None, video_title=None):
@@ -610,7 +626,7 @@ class InfoExtractor(object):
 
         return (username, password)
 
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
         """
         Get the two-factor authentication info
         TODO - asking the user will be required for sms/phone verify
@@ -624,7 +640,7 @@ class InfoExtractor(object):
         if downloader_params.get('twofactor', None) is not None:
             return downloader_params['twofactor']
 
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
@@ -724,20 +740,23 @@ class InfoExtractor(object):
 
     @staticmethod
     def _hidden_inputs(html):
-        return dict([
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+        hidden_inputs = {}
+        for input in re.findall(r'(?i)<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
 
     def _form_hidden_inputs(self, form_id, html):
         form = self._search_regex(
-            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
             html, '%s form' % form_id, group='form')
         return self._hidden_inputs(form)
 
@@ -852,13 +871,18 @@ class InfoExtractor(object):
         time.sleep(timeout)
 
     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
             'Unable to download f4m manifest',
             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return manifest
 
         formats = []
         manifest_version = '1.0'
@@ -879,7 +903,10 @@ class InfoExtractor(object):
                 # may differ leading to inability to resolve the format by requested
                 # bitrate in f4m downloader
                 if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
                     continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
@@ -1027,6 +1054,7 @@ class InfoExtractor(object):
         video_id = os.path.splitext(url_basename(smil_url))[0]
         title = None
         description = None
+        upload_date = None
         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
             name = meta.attrib.get('name')
             content = meta.attrib.get('content')
@@ -1036,11 +1064,22 @@ class InfoExtractor(object):
                 title = content
             elif not description and name in ('description', 'abstract'):
                 description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
 
         return {
             'id': video_id,
             'title': title or video_id,
             'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
             'formats': formats,
             'subtitles': subtitles,
         }
@@ -1049,7 +1088,7 @@ class InfoExtractor(object):
         return self._search_regex(
             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
 
-    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
         base = smil_url
         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
             b = meta.get('base') or meta.get('httpBase')
@@ -1067,7 +1106,7 @@ class InfoExtractor(object):
             if not src:
                 continue
 
-            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
             filesize = int_or_none(video.get('size') or video.get('fileSize'))
             width = int_or_none(video.get('width'))
             height = int_or_none(video.get('height'))
@@ -1088,13 +1127,21 @@ class InfoExtractor(object):
                     'width': width,
                     'height': height,
                 })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
                 continue
 
             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
 
             if proto == 'm3u8' or src_ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
                 continue
 
             if src_ext == 'f4m':
@@ -1106,10 +1153,12 @@ class InfoExtractor(object):
                     }
                 f4m_url += '&' if '?' in f4m_url else '?'
                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
                 continue
 
-            if src_url.startswith('http'):
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                 http_count += 1
                 formats.append({
                     'url': src_url,
@@ -1126,7 +1175,7 @@ class InfoExtractor(object):
 
         return formats
 
-    def _parse_smil_subtitles(self, smil, namespace=None):
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
         subtitles = {}
         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
             src = textstream.get('src')
@@ -1135,9 +1184,14 @@ class InfoExtractor(object):
             ext = textstream.get('ext') or determine_ext(src)
             if not ext:
                 type_ = textstream.get('type')
-                if type_ == 'text/srt':
-                    ext = 'srt'
-            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+                SUBTITLES_TYPES = {
+                    'text/vtt': 'vtt',
+                    'text/srt': 'srt',
+                    'application/smptett+xml': 'tt',
+                }
+                if type_ in SUBTITLES_TYPES:
+                    ext = SUBTITLES_TYPES[type_]
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
             subtitles.setdefault(lang, []).append({
                 'url': src,
                 'ext': ext,
@@ -1265,6 +1319,23 @@ class InfoExtractor(object):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
     def extract_automatic_captions(self, *args, **kwargs):
         if (self._downloader.params.get('writeautomaticsub', False) or
                 self._downloader.params.get('listsubtitles')):