ceskatelevize: Closed captions support

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 6a95e2952a63a2960d795a6c6b538c9d4b6fe2b6..493afb57d89a2eade1707b09f6c5ed2ea21b680c 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -23,6 +23,7 @@ from ..utils import (
      unescapeHTML,
      unified_strdate,
      unsmuggle_url,
+    UnsupportedError,
      url_basename,
  )
  from .brightcove import BrightcoveIE
@@ -180,6 +181,14 @@ class GenericIE(InfoExtractor):
                  'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
              },
          },
+        # BBC iPlayer embeds
+        {
+            'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
+            'info_dict': {
+                'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
+            },
+            'playlist_mincount': 18,
+        },
          # RUTV embed
          {
              'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -452,7 +461,32 @@ class GenericIE(InfoExtractor):
                  'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
              },
              'playlist_mincount': 2,
-        }
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # Cinchcast embed
+        {
+            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+            'info_dict': {
+                'id': '7141703',
+                'ext': 'mp3',
+                'upload_date': '20141126',
+                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+            }
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -606,10 +640,28 @@ class GenericIE(InfoExtractor):
          if not self._downloader.params.get('test', False) and not is_intentional:
              self._downloader.report_warning('Falling back on generic information extractor.')
  
-        if full_response:
-            webpage = self._webpage_read_content(full_response, url, video_id)
-        else:
-            webpage = self._download_webpage(url, video_id)
+        if not full_response:
+            full_response = self._request_webpage(url, video_id)
+
+        # Maybe it's a direct link to a video?
+        # Be careful not to download the whole thing!
+        first_bytes = full_response.read(512)
+        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+            self._downloader.report_warning(
+                'URL could be a direct video link, returning it as such.')
+            upload_date = unified_strdate(
+                head_response.headers.get('Last-Modified'))
+            return {
+                'id': video_id,
+                'title': os.path.splitext(url_basename(url))[0],
+                'direct': True,
+                'url': url,
+                'upload_date': upload_date,
+            }
+
+        webpage = self._webpage_read_content(
+            full_response, url, video_id, prefix=first_bytes)
+
          self.report_extraction(video_id)
  
          # Is it an RSS feed?
@@ -655,9 +707,9 @@ class GenericIE(InfoExtractor):
              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
  
          # Helper method
-        def _playlist_from_matches(matches, getter, ie=None):
+        def _playlist_from_matches(matches, getter=None, ie=None):
              urlrs = orderedSet(
-                self.url_result(self._proto_relative_url(getter(m)), ie)
+                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
                  for m in matches)
              return self.playlist_result(
                  urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -861,6 +913,11 @@ class GenericIE(InfoExtractor):
              return _playlist_from_matches(
                  matches, getter=unescapeHTML, ie='FunnyOrDie')
  
+        # Look for BBC iPlayer embed
+        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+        if matches:
+            return _playlist_from_matches(matches, ie='BBCCoUk')
+
          # Look for embedded RUTV player
          rutv_url = RUTVIE._extract_url(webpage)
          if rutv_url:
@@ -928,6 +985,13 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'SBS')
  
+        # Look for embedded Cinchcast player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Cinchcast')
+
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
              webpage)
@@ -1007,7 +1071,7 @@ class GenericIE(InfoExtractor):
                      'url': new_url,
                  }
          if not found:
-            raise ExtractorError('Unsupported URL: %s' % url)
+            raise UnsupportedError(url)
  
          entries = []
          for video_url in found: