Unify coding cookie

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 6c4af64243c86d49de81e9c9c333637cc1cbe40d..9ea306e3a4313bbc78f43b5d6fce7c6216996561 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,4 +1,4 @@
-# encoding: utf-8
+# coding: utf-8
  
  from __future__ import unicode_literals
  
@@ -72,6 +72,8 @@ from .kaltura import KalturaIE
  from .eagleplatform import EaglePlatformIE
  from .facebook import FacebookIE
  from .soundcloud import SoundcloudIE
+from .vbox7 import Vbox7IE
+from .dbtv import DBTVIE
  
  
  class GenericIE(InfoExtractor):
@@ -102,7 +104,8 @@ class GenericIE(InfoExtractor):
              },
              'expected_warnings': [
                  'URL could be a direct video link, returning it as such.'
-            ]
+            ],
+            'skip': 'URL invalid',
          },
          # Direct download with broken HEAD
          {
@@ -266,7 +269,8 @@ class GenericIE(InfoExtractor):
              'params': {
                  # m3u8 downloads
                  'skip_download': True,
-            }
+            },
+            'skip': 'video gone',
          },
          # m3u8 served with Content-Type: text/plain
          {
@@ -281,7 +285,8 @@ class GenericIE(InfoExtractor):
              'params': {
                  # m3u8 downloads
                  'skip_download': True,
-            }
+            },
+            'skip': 'video gone',
          },
          # google redirect
          {
@@ -366,6 +371,7 @@ class GenericIE(InfoExtractor):
                  'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
              },
              'add_ie': ['BrightcoveLegacy'],
+            'skip': 'video gone',
          },
          {
              'url': 'http://www.championat.com/video/football/v/87/87499.html',
@@ -419,6 +425,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
+            'skip': 'movie expired',
          },
          # embed.ly video
          {
@@ -446,6 +453,8 @@ class GenericIE(InfoExtractor):
                  'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
                  'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
              },
+            # HEAD requests end up in an endless chain of 301 responses, while GET works
+            'expected_warnings': ['301'],
          },
          # RUTV embed
          {
@@ -520,6 +529,9 @@ class GenericIE(InfoExtractor):
                  'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
              },
              'playlist_mincount': 7,
+            # This forum no longer allows <iframe> syntax;
+            # HTML tags are now displayed as-is
+            'skip': 'No videos on this page',
          },
          # Embedded TED video
          {
@@ -568,7 +580,8 @@ class GenericIE(InfoExtractor):
              },
              'params': {
                  'skip_download': 'Requires rtmpdump'
-            }
+            },
+            'skip': 'video gone',
          },
          # francetv embed
          {
@@ -785,6 +798,15 @@ class GenericIE(InfoExtractor):
                  'upload_date': '20141029',
              }
          },
+        # Soundcloud multiple embeds
+        {
+            'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809',
+            'info_dict': {
+                'id': '52809',
+                'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance  | TAB + AUDIO',
+            },
+            'playlist_mincount': 7,
+        },
          # Livestream embed
          {
              'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
@@ -1347,6 +1369,11 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Vimeo'],
          },
+        {
+            # generic vimeo embed that requires the original URL to be passed as Referer
+            'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/',
+            'only_matching': True,
+        },
          {
              'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video',
              'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
@@ -1364,6 +1391,27 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': [ArkenaIE.ie_key()],
          },
+        {
+            'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/',
+            'info_dict': {
+                'id': '1c7141f46c',
+                'ext': 'mp4',
+                'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [Vbox7IE.ie_key()],
+        },
+        {
+            # DBTV embeds
+            'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/',
+            'info_dict': {
+                'id': '43254897',
+                'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
+            },
+            'playlist_mincount': 3,
+        },
          # {
          #     # TODO: find another test
          #     # http://schema.org/VideoObject
@@ -1609,7 +1657,9 @@ class GenericIE(InfoExtractor):
                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
-                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                    doc, video_id,
+                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_url=url)
                  self._sort_formats(info_dict['formats'])
                  return info_dict
              elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@@ -2198,6 +2248,43 @@ class GenericIE(InfoExtractor):
              return self.url_result(
                  self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
  
+        # Look for VODPlatform embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform')
+
+        # Look for Mangomolo embeds
+        mobj = re.search(
+            r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/
+                (?:
+                    video\?.*?\bid=(?P<video_id>\d+)|
+                    index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
+                ).+?)\1''', webpage)
+        if mobj is not None:
+            info = {
+                '_type': 'url_transparent',
+                'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))),
+                'title': video_title,
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'uploader': video_uploader,
+            }
+            video_id = mobj.group('video_id')
+            if video_id:
+                info.update({
+                    'ie_key': 'MangomoloVideo',
+                    'id': video_id,
+                })
+            else:
+                info.update({
+                    'ie_key': 'MangomoloLive',
+                    'id': mobj.group('channel_id'),
+                })
+            return info
+
          # Look for Instagram embeds
          instagram_embed_url = InstagramIE._extract_embed_url(webpage)
          if instagram_embed_url is not None:
@@ -2222,10 +2309,20 @@ class GenericIE(InfoExtractor):
                  'uploader': video_uploader,
              }
  
+        # Look for VBOX7 embeds
+        vbox7_url = Vbox7IE._extract_url(webpage)
+        if vbox7_url:
+            return self.url_result(vbox7_url, Vbox7IE.ie_key())
+
+        # Look for DBTV embeds
+        dbtv_urls = DBTVIE._extract_urls(webpage)
+        if dbtv_urls:
+            return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
+
          # Looking for http://schema.org/VideoObject
          json_ld = self._search_json_ld(
-            webpage, video_id, default=None, expected_type='VideoObject')
-        if json_ld and json_ld.get('url'):
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
              info_dict.update({
                  'title': video_title or info_dict['title'],
                  'description': video_description,
@@ -2235,12 +2332,23 @@ class GenericIE(InfoExtractor):
              info_dict.update(json_ld)
              return info_dict
  
+        # Look for HTML5 media
+        entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
+        if entries:
+            for entry in entries:
+                entry.update({
+                    'id': video_id,
+                    'title': video_title,
+                })
+                self._sort_formats(entry['formats'])
+            return self.playlist_result(entries)
+
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
              vpath = compat_urlparse.urlparse(vurl).path
              vext = determine_ext(vpath)
-            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
+            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
  
          def filter_video(urls):
              return list(filter(check_video, urls))
@@ -2290,9 +2398,6 @@ class GenericIE(InfoExtractor):
              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
              if m_video_type is not None:
                  found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
-        if not found:
-            # HTML5 video
-            found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
          if not found:
              REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
              found = re.search(