Merge branch 'remitamine-brightcove_in_page_embed'

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index ec748ed9f4c7b81f4e49270a9857ba429a2cd108..51516a38a14646f29bdc160b60e0298ad1234739 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
  
  import os
  import re
+import sys
  
  from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_parse_unquote,
      compat_urllib_request,
      compat_urlparse,
@@ -20,7 +22,6 @@ from ..utils import (
      HEADRequest,
      is_html,
      orderedSet,
-    parse_xml,
      smuggle_url,
      unescapeHTML,
      unified_strdate,
@@ -29,7 +30,10 @@ from ..utils import (
      url_basename,
      xpath_text,
  )
-from .brightcove import BrightcoveIE
+from .brightcove import (
+    BrightcoveLegacyIE,
+    BrightcoveNewIE,
+)
  from .nbc import NBCSportsVPlayerIE
  from .ooyala import OoyalaIE
  from .rutv import RUTVIE
@@ -49,6 +53,7 @@ from .dailymotion import DailymotionCloudIE
  from .onionstudios import OnionStudiosIE
  from .snagfilms import SnagFilmsEmbedIE
  from .screenwavemedia import ScreenwaveMediaIE
+from .mtv import MTVServicesEmbeddedIE
  
  
  class GenericIE(InfoExtractor):
@@ -139,6 +144,7 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'Automatics, robotics and biocybernetics',
                  'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                'upload_date': '20130627',
                  'formats': 'mincount:16',
                  'subtitles': 'mincount:1',
              },
@@ -230,6 +236,22 @@ class GenericIE(InfoExtractor):
                  'skip_download': False,
              }
          },
+        {
+            # redirect in Refresh HTTP header
+            'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
+            'info_dict': {
+                'id': 'pO8h3EaFRdo',
+                'ext': 'mp4',
+                'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+                'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
+                'upload_date': '20150917',
+                'uploader_id': 'brtvofficial',
+                'uploader': 'Boiler Room',
+            },
+            'params': {
+                'skip_download': False,
+            },
+        },
          {
              'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
              'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -256,7 +278,7 @@ class GenericIE(InfoExtractor):
          # it also tests brightcove videos that need to set the 'Referer' in the
          # http requests
          {
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
              'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
              'info_dict': {
                  'id': '2765128793001',
@@ -280,7 +302,7 @@ class GenericIE(InfoExtractor):
                  'uploader': 'thestar.com',
                  'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
              },
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
          },
          {
              'url': 'http://www.championat.com/video/football/v/87/87499.html',
@@ -295,7 +317,7 @@ class GenericIE(InfoExtractor):
          },
          {
              # https://github.com/rg3/youtube-dl/issues/3541
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
              'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
              'info_dict': {
                  'id': '3866516442001',
@@ -1012,6 +1034,17 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'cinemasnob',
              },
+        },
+        # BrightcoveInPageEmbed embed
+        {
+            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
+            'info_dict': {
+                'id': '4238694884001',
+                'ext': 'flv',
+                'title': 'Tabletop: Dread, Last Thoughts',
+                'description': 'Tabletop: Dread, Last Thoughts',
+                'duration': 51690,
+            },
          }
      ]
  
@@ -1219,7 +1252,7 @@ class GenericIE(InfoExtractor):
  
          # Is it an RSS feed, a SMIL file or a XSPF playlist?
          try:
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  return self._extract_rss(url, video_id, doc)
              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -1271,14 +1304,14 @@ class GenericIE(InfoExtractor):
              return self.playlist_result(
                  urlrs, playlist_id=video_id, playlist_title=video_title)
  
-        # Look for BrightCove:
-        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+        # Look for Brightcove Legacy Studio embeds
+        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
          if bc_urls:
              self.to_screen('Brightcove video detected.')
              entries = [{
                  '_type': 'url',
                  'url': smuggle_url(bc_url, {'Referer': url}),
-                'ie_key': 'Brightcove'
+                'ie_key': 'BrightcoveLegacy'
              } for bc_url in bc_urls]
  
              return {
@@ -1288,6 +1321,11 @@ class GenericIE(InfoExtractor):
                  'entries': entries,
              }
  
+        # Look for Brightcove New Studio embeds
+        bc_urls = BrightcoveNewIE._extract_urls(webpage)
+        if bc_urls:
+            return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+
          # Look for embedded rtl.nl player
          matches = re.findall(
              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
@@ -1594,12 +1632,9 @@ class GenericIE(InfoExtractor):
              return self.url_result(url, ie='Vulture')
  
          # Look for embedded mtvservices player
-        mobj = re.search(
-            r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
-            webpage)
-        if mobj is not None:
-            url = unescapeHTML(mobj.group('url'))
-            return self.url_result(url, ie='MTVServicesEmbedded')
+        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+        if mtvservices_url:
+            return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
  
          # Look for embedded yahoo player
          mobj = re.search(
@@ -1638,7 +1673,7 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group('url'), 'MLB')
  
          mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+            r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
              webpage)
          if mobj is not None:
              return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
@@ -1656,8 +1691,8 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group('url'), 'Zapiks')
  
          # Look for Kaltura embeds
-        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
-                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
+        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
          if mobj is not None:
              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
  
@@ -1808,6 +1843,9 @@ class GenericIE(InfoExtractor):
                  # Look also in Refresh HTTP header
                  refresh_header = head_response.headers.get('Refresh')
                  if refresh_header:
+                    # In python 2 response HTTP headers are bytestrings
+                    if sys.version_info < (3, 0) and isinstance(refresh_header, str):
+                        refresh_header = refresh_header.decode('iso-8859-1')
                      found = re.search(REDIRECT_REGEX, refresh_header)
              if found:
                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))