[comedycentral] Improve regexes

[youtube-dl] / youtube_dl / extractor / brightcove.py
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index 4ba3f7c4200169145fd03ff44f3af3c753f47c74..9ccf923a63fbd59b098f2e0edb15c025c5d0b602 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -9,9 +9,11 @@ from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse,
      find_xpath_attr,
  from ..utils import (
      compat_urllib_parse,
      find_xpath_attr,
+    fix_xml_ampersands,
      compat_urlparse,
      compat_str,
      compat_urllib_request,
      compat_urlparse,
      compat_str,
      compat_urllib_request,
+    compat_parse_qs,
  
      ExtractorError,
      unsmuggle_url,
  
      ExtractorError,
      unsmuggle_url,
@@ -21,7 +23,6 @@ from ..utils import (
  class BrightcoveIE(InfoExtractor):
      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
  class BrightcoveIE(InfoExtractor):
      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
-    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
  
      _TESTS = [
          {
  
      _TESTS = [
          {
@@ -68,7 +69,7 @@ class BrightcoveIE(InfoExtractor):
                  'description': 'md5:363109c02998fee92ec02211bd8000df',
                  'uploader': 'National Ballet of Canada',
              },
                  'description': 'md5:363109c02998fee92ec02211bd8000df',
                  'uploader': 'National Ballet of Canada',
              },
-        },
+        }
      ]
  
      @classmethod
      ]
  
      @classmethod
@@ -83,17 +84,33 @@ class BrightcoveIE(InfoExtractor):
                              lambda m: m.group(1) + '/>', object_str)
          # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
          object_str = object_str.replace('<--', '<!--')
                              lambda m: m.group(1) + '/>', object_str)
          # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
          object_str = object_str.replace('<--', '<!--')
+        object_str = fix_xml_ampersands(object_str)
  
          object_doc = xml.etree.ElementTree.fromstring(object_str)
  
          object_doc = xml.etree.ElementTree.fromstring(object_str)
-        assert 'BrightcoveExperience' in object_doc.attrib['class']
-        params = {'flashID': object_doc.attrib['id'],
-                  'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
-                  }
+
+        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
+        if fv_el is not None:
+            flashvars = dict(
+                (k, v[0])
+                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
+        else:
+            flashvars = {}
+
          def find_param(name):
          def find_param(name):
+            if name in flashvars:
+                return flashvars[name]
              node = find_xpath_attr(object_doc, './param', 'name', name)
              if node is not None:
                  return node.attrib['value']
              return None
              node = find_xpath_attr(object_doc, './param', 'name', name)
              if node is not None:
                  return node.attrib['value']
              return None
+
+        params = {}
+
+        playerID = find_param('playerID')
+        if playerID is None:
+            raise ExtractorError('Cannot find player ID')
+        params['playerID'] = playerID
+
          playerKey = find_param('playerKey')
          # Not all pages define this value
          if playerKey is not None:
          playerKey = find_param('playerKey')
          # Not all pages define this value
          if playerKey is not None:
@@ -113,9 +130,18 @@ class BrightcoveIE(InfoExtractor):
          """Try to extract the brightcove url from the wepbage, returns None
          if it can't be found
          """
          """Try to extract the brightcove url from the wepbage, returns None
          if it can't be found
          """
+
+        url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
+        if url_m:
+            return url_m.group(1)
+
          m_brightcove = re.search(
          m_brightcove = re.search(
-            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
-            webpage, re.DOTALL)
+            r'''(?sx)<object
+            (?:
+                [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
+                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
+            ).+?</object>''',
+            webpage)
          if m_brightcove is not None:
              return cls._build_brighcove_url(m_brightcove.group())
          else:
          if m_brightcove is not None:
              return cls._build_brighcove_url(m_brightcove.group())
          else:
@@ -156,12 +182,14 @@ class BrightcoveIE(InfoExtractor):
          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
          info = json.loads(info)['data']
          video_info = info['programmedContent']['videoPlayer']['mediaDTO']
          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
          info = json.loads(info)['data']
          video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+        video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
  
          return self._extract_video_info(video_info)
  
      def _get_playlist_info(self, player_key):
  
          return self._extract_video_info(video_info)
  
      def _get_playlist_info(self, player_key):
-        playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
-                                               player_key, 'Downloading playlist information')
+        info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
+        playlist_info = self._download_webpage(
+            info_url, player_key, 'Downloading playlist information')
  
          json_data = json.loads(playlist_info)
          if 'videoList' not in json_data:
  
          json_data = json.loads(playlist_info)
          if 'videoList' not in json_data:
@@ -175,7 +203,7 @@ class BrightcoveIE(InfoExtractor):
      def _extract_video_info(self, video_info):
          info = {
              'id': compat_str(video_info['id']),
      def _extract_video_info(self, video_info):
          info = {
              'id': compat_str(video_info['id']),
-            'title': video_info['displayName'],
+            'title': video_info['displayName'].strip(),
              'description': video_info.get('shortDescription'),
              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
              'uploader': video_info.get('publisherName'),
              'description': video_info.get('shortDescription'),
              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
              'uploader': video_info.get('publisherName'),
@@ -193,6 +221,23 @@ class BrightcoveIE(InfoExtractor):
              info.update({
                  'url': video_info['FLVFullLengthURL'],
              })
              info.update({
                  'url': video_info['FLVFullLengthURL'],
              })
-        else:
+
+        if self._downloader.params.get('include_ads', False):
+            adServerURL = video_info.get('_youtubedl_adServerURL')
+            if adServerURL:
+                ad_info = {
+                    '_type': 'url',
+                    'url': adServerURL,
+                }
+                if 'url' in info:
+                    return {
+                        '_type': 'playlist',
+                        'title': info['title'],
+                        'entries': [ad_info, info],
+                    }
+                else:
+                    return ad_info
+
+        if 'url' not in info and not info.get('formats'):
              raise ExtractorError('Unable to extract video url for %s' % info['id'])
          return info
              raise ExtractorError('Unable to extract video url for %s' % info['id'])
          return info