Merge branch 'walla' of https://github.com/lenaten/youtube-dl into lenaten-walla

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index cc0a77e1ed755c2ca971246a94b766858aec8828..450c7dfd69d0000c810f18ef35741aae05221c40 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,6 +1,7 @@
  from __future__ import unicode_literals
  
  import base64
+import datetime
  import hashlib
  import json
  import netrc
@@ -15,11 +16,13 @@ from ..utils import (
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse_urlparse,
+    compat_urlparse,
      compat_str,
  
      clean_html,
      compiled_regex_type,
      ExtractorError,
+    float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
@@ -130,9 +133,13 @@ class InfoExtractor(object):
                      by YoutubeDL if it's missing)
      categories:     A list of categories that the video falls in, for example
                      ["Sports", "Berlin"]
+    is_live:        True, False, or None (=unknown). Whether this video is a
+                    live stream that goes on instead of a fixed-length video.
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
      Subclasses of this one should re-define the _real_initialize() and
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
@@ -161,6 +168,14 @@ class InfoExtractor(object):
              cls._VALID_URL_RE = re.compile(cls._VALID_URL)
          return cls._VALID_URL_RE.match(url) is not None
  
+    @classmethod
+    def _match_id(cls, url):
+        if '_VALID_URL_RE' not in vars(cls):  # look only at this class's own dict
+            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+        mobj = cls._VALID_URL_RE.match(url)
+        assert mobj  # url is expected to match _VALID_URL at this point
+        return mobj.group('id')  # relies on a named group 'id' in _VALID_URL
+
      @classmethod
      def working(cls):
          """Getter method for _WORKING."""
@@ -321,7 +336,11 @@ class InfoExtractor(object):
          try:
              return json.loads(json_string)
          except ValueError as ve:
-            raise ExtractorError('Failed to download JSON', cause=ve)
+            errmsg = '%s: Failed to parse JSON ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
  
      def report_warning(self, msg, video_id=None):
          idstr = '' if video_id is None else '%s: ' % video_id
@@ -638,7 +657,9 @@ class InfoExtractor(object):
  
          return formats
  
-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
+    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+                              entry_protocol='m3u8', preference=None):
+
          formats = [{
              'format_id': 'm3u8-meta',
              'url': m3u8_url,
@@ -649,6 +670,11 @@ class InfoExtractor(object):
              'format_note': 'Quality selection URL',
          }]
  
+        format_url = lambda u: (
+            u
+            if re.match(r'^https?://', u)
+            else compat_urlparse.urljoin(m3u8_url, u))
+
          m3u8_doc = self._download_webpage(m3u8_url, video_id)
          last_info = None
          kv_rex = re.compile(
@@ -664,22 +690,27 @@ class InfoExtractor(object):
              elif line.startswith('#') or not line.strip():
                  continue
              else:
-                if last_info is none:
-                    formats.append({'url': line})
+                if last_info is None:
+                    formats.append({'url': format_url(line)})
                      continue
                  tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
  
                  f = {
                      'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
-                    'url': line.strip(),
+                    'url': format_url(line.strip()),
                      'tbr': tbr,
                      'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
                  }
                  codecs = last_info.get('CODECS')
                  if codecs:
-                    video, audio = codecs.split(',')
-                    f['vcodec'] = video.partition('.')[0]
-                    f['acodec'] = audio.partition('.')[0]
+                    # TODO: it looks like the video codec does not necessarily come first
+                    va_codecs = codecs.split(',')
+                    if va_codecs[0]:
+                        f['vcodec'] = va_codecs[0].partition('.')[0]
+                    if len(va_codecs) > 1 and va_codecs[1]:
+                        f['acodec'] = va_codecs[1].partition('.')[0]
                  resolution = last_info.get('RESOLUTION')
                  if resolution:
                      width_str, height_str = resolution.split('x')
@@ -690,6 +721,34 @@ class InfoExtractor(object):
          self._sort_formats(formats)
          return formats
  
+    def _live_title(self, name):
+        """Return name, a space, and the current local time formatted as
+        YYYY-MM-DD HH:MM, for use as the title of a live video."""
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
+        return name + ' ' + timestamp
+
+    def _int(self, v, name, fatal=False, **kwargs):
+        """Return int_or_none(v, **kwargs); when that is None, raise
+        ExtractorError if fatal, else report a warning naming the field."""
+        res = int_or_none(v, **kwargs)
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
+    def _float(self, v, name, fatal=False, **kwargs):
+        """Return float_or_none(v, **kwargs); on None raise if fatal, else warn."""
+        parsed = float_or_none(v, **kwargs)
+        if parsed is None:
+            problem = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(problem)
+            self._downloader.report_warning(problem)
+        return parsed
+
  
  class SearchInfoExtractor(InfoExtractor):
      """