Merge branch 'walla' of https://github.com/lenaten/youtube-dl into lenaten-walla
[youtube-dl] / youtube_dl / extractor / common.py
index cc0a77e1ed755c2ca971246a94b766858aec8828..450c7dfd69d0000c810f18ef35741aae05221c40 100644 (file)
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 import base64
+import datetime
 import hashlib
 import json
 import netrc
@@ -15,11 +16,13 @@ from ..utils import (
     compat_http_client,
     compat_urllib_error,
     compat_urllib_parse_urlparse,
+    compat_urlparse,
     compat_str,
 
     clean_html,
     compiled_regex_type,
     ExtractorError,
+    float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
@@ -130,9 +133,13 @@ class InfoExtractor(object):
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
+    is_live:        True, False, or None (=unknown). Whether this video is a
+                    live stream that goes on instead of a fixed-length video.
 
     Unless mentioned otherwise, the fields should be Unicode strings.
 
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
     Subclasses of this one should re-define the _real_initialize() and
     _real_extract() methods and define a _VALID_URL regexp.
     Probably, they should also be added to the list of extractors.
@@ -161,6 +168,14 @@ class InfoExtractor(object):
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         return cls._VALID_URL_RE.match(url) is not None
 
+    @classmethod
+    def _match_id(cls, url):
+        if '_VALID_URL_RE' not in cls.__dict__:
+            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+        m = cls._VALID_URL_RE.match(url)
+        assert m
+        return m.group('id')
+
     @classmethod
     def working(cls):
         """Getter method for _WORKING."""
@@ -321,7 +336,11 @@ class InfoExtractor(object):
         try:
             return json.loads(json_string)
         except ValueError as ve:
-            raise ExtractorError('Failed to download JSON', cause=ve)
+            errmsg = '%s: Failed to parse JSON ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
 
     def report_warning(self, msg, video_id=None):
         idstr = '' if video_id is None else '%s: ' % video_id
@@ -638,7 +657,9 @@ class InfoExtractor(object):
 
         return formats
 
-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
+    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+                              entry_protocol='m3u8', preference=None):
+
         formats = [{
             'format_id': 'm3u8-meta',
             'url': m3u8_url,
@@ -649,6 +670,11 @@ class InfoExtractor(object):
             'format_note': 'Quality selection URL',
         }]
 
+        format_url = lambda u: (
+            u
+            if re.match(r'^https?://', u)
+            else compat_urlparse.urljoin(m3u8_url, u))
+
         m3u8_doc = self._download_webpage(m3u8_url, video_id)
         last_info = None
         kv_rex = re.compile(
@@ -664,22 +690,27 @@ class InfoExtractor(object):
             elif line.startswith('#') or not line.strip():
                 continue
             else:
-                if last_info is none:
-                    formats.append({'url': line})
+                if last_info is None:
+                    formats.append({'url': format_url(line)})
                     continue
                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 
                 f = {
                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
-                    'url': line.strip(),
+                    'url': format_url(line.strip()),
                     'tbr': tbr,
                     'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
                 }
                 codecs = last_info.get('CODECS')
                 if codecs:
-                    video, audio = codecs.split(',')
-                    f['vcodec'] = video.partition('.')[0]
-                    f['acodec'] = audio.partition('.')[0]
+                    # TODO: looks like video codec is not always necessarily goes first
+                    va_codecs = codecs.split(',')
+                    if va_codecs[0]:
+                        f['vcodec'] = va_codecs[0].partition('.')[0]
+                    if len(va_codecs) > 1 and va_codecs[1]:
+                        f['acodec'] = va_codecs[1].partition('.')[0]
                 resolution = last_info.get('RESOLUTION')
                 if resolution:
                     width_str, height_str = resolution.split('x')
@@ -690,6 +721,34 @@ class InfoExtractor(object):
         self._sort_formats(formats)
         return formats
 
+    def _live_title(self, name):
+        """ Generate the title for a live video """
+        now = datetime.datetime.now()
+        now_str = now.strftime("%Y-%m-%d %H:%M")
+        return name + ' ' + now_str
+
+    def _int(self, v, name, fatal=False, **kwargs):
+        res = int_or_none(v, **kwargs)
+        if 'get_attr' in kwargs:
+            print(getattr(v, kwargs['get_attr']))
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
+    def _float(self, v, name, fatal=False, **kwargs):
+        res = float_or_none(v, **kwargs)
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
 
 class SearchInfoExtractor(InfoExtractor):
     """