[BostonGlobe] New. Nonstandard version of Brightcove.

author John Hawkinson <jhawk@mit.edu>

Wed, 8 Mar 2017 23:13:54 +0000 (18:13 -0500)

committer Yen Chi Hsuan <yan12125@gmail.com>

Sun, 19 Mar 2017 12:40:31 +0000 (20:40 +0800)
author John Hawkinson <jhawk@mit.edu>
Wed, 8 Mar 2017 23:13:54 +0000 (18:13 -0500)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sun, 19 Mar 2017 12:40:31 +0000 (20:40 +0800)
diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py

new file mode 100644 (file)

index 0000000..57882fb
--- /dev/null
+++ b/youtube_dl/extractor/bostonglobe.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    extract_attributes,
+)
+
+
+class BostonGlobeIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
+    _TESTS = [
+        {
+            'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
+            'md5': '0a62181079c85c2d2b618c9a738aedaf',
+            'info_dict': {
+                'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
+                'id': '5320421710001',
+                'ext': 'mp4',
+                'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
+                'timestamp': 1486877593,
+                'upload_date': '20170212',
+                'uploader_id': '245991542',
+            },
+        },
+        {
+            # Embedded youtube video; we hand it off to the Generic extractor.
+            'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
+            'md5': '582b40327089d5c0c949b3c54b13c24b',
+            'info_dict': {
+                'title': "Who Is Matt Damon's Favorite Batman?",
+                'id': 'ZW1QCnlA6Qc',
+                'ext': 'mp4',
+                'upload_date': '20170217',
+                'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
+                'uploader': 'The Late Late Show with James Corden',
+                'uploader_id': 'TheLateLateShow',
+            },
+            'expected_warnings': ['404'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        page_title = self._og_search_title(webpage, default=None)
+
+        # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
+        entries = []
+        for video in re.findall(r'(?i)(<video[^>]+>)', webpage):
+            attrs = extract_attributes(video)
+
+            video_id = attrs.get('data-brightcove-video-id')
+            account_id = attrs.get('data-account')
+            player_id = attrs.get('data-player')
+            embed = attrs.get('data-embed')
+
+            if video_id and account_id and player_id and embed:
+                entries.append(
+                    'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+                    % (account_id, player_id, embed, video_id))
+
+        if len(entries) == 0:
+            return self.url_result(url, 'Generic')
+        elif len(entries) == 1:
+            return self.url_result(entries[0], 'BrightcoveNew')
+        else:
+            return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index b51799bfa6bcc4b873b7e4fdec0f873b0bbf77bd..0852b8e8c1c15e3600ed16d5e95d9cac901f32bc 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -36,34 +36,35 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
+    determine_protocol,
      error_to_compat_str,
      ExtractorError,
+    extract_attributes,
      fix_xml_ampersands,
      float_or_none,
      GeoRestrictedError,
      GeoUtils,
      int_or_none,
      js_to_json,
+    mimetype2ext,
+    orderedSet,
+    parse_codecs,
+    parse_duration,
      parse_iso8601,
+    parse_m3u8_attributes,
      RegexNotFoundError,
-    sanitize_filename,
      sanitized_Request,
+    sanitize_filename,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
+    update_Request,
+    update_url_query,
+    urljoin,
      url_basename,
      xpath_element,
      xpath_text,
      xpath_with_ns,
-    determine_protocol,
-    parse_duration,
-    mimetype2ext,
-    update_Request,
-    update_url_query,
-    parse_m3u8_attributes,
-    extract_attributes,
-    parse_codecs,
-    urljoin,
  )
  
  
@@ -714,6 +715,13 @@ class InfoExtractor(object):
              video_info['title'] = video_title
          return video_info
  
+    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
+        urlrs = orderedSet(
+            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+            for m in matches)
+        return self.playlist_result(
+            urlrs, playlist_id=video_id, playlist_title=video_title)
+
      @staticmethod
      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
          """Returns a playlist"""
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py

index 97d68d9ca64e26a693ec36e431c9ce60921f12e5..40a5c9842a63a488d43d8f20e287ee7d0d4f4802 100644 (file)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -117,6 +117,7 @@ from .bleacherreport import (
  from .blinkx import BlinkxIE
  from .bloomberg import BloombergIE
  from .bokecc import BokeCCIE
+from .bostonglobe import BostonGlobeIE
  from .bpb import BpbIE
  from .br import BRIE
  from .bravotv import BravoTVIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 0fcb3fdac36d2de9cff4a1f078005e2bbbd0efe5..a71d6bac01faa69a377f01d2fdd39c927766684e 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor):
          video_description = self._og_search_description(webpage, default=None)
          video_thumbnail = self._og_search_thumbnail(webpage, default=None)
  
-        # Helper method
-        def _playlist_from_matches(matches, getter=None, ie=None):
-            urlrs = orderedSet(
-                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
-                for m in matches)
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
-
          # Look for Brightcove Legacy Studio embeds
          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
          if bc_urls:
@@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor):
          # Look for Brightcove New Studio embeds
          bc_urls = BrightcoveNewIE._extract_urls(webpage)
          if bc_urls:
-            return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+            return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
  
          # Look for ThePlatform embeds
          tp_urls = ThePlatformIE._extract_urls(webpage)
          if tp_urls:
-            return _playlist_from_matches(tp_urls, ie='ThePlatform')
+            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
  
          # Look for Vessel embeds
          vessel_urls = VesselIE._extract_urls(webpage)
          if vessel_urls:
-            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+            return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
  
          # Look for embedded rtl.nl player
          matches = re.findall(
              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
              webpage)
          if matches:
-            return _playlist_from_matches(matches, ie='RtlNl')
+            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
  
          vimeo_urls = VimeoIE._extract_urls(url, webpage)
          if vimeo_urls:
-            return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
+            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
  
          vid_me_embed_url = self._search_regex(
              r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
@@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor):
                  (?:embed|v|p)/.+?)
              \1''', webpage)
          if matches:
-            return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]))
+            return self.playlist_from_matches(
+                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
  
          # Look for lazyYT YouTube embed
          matches = re.findall(
              r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
          if matches:
-            return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
  
          # Look for Wordpress "YouTube Video Importer" plugin
          matches = re.findall(r'''(?x)<div[^>]+
              class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
              data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
          if matches:
-            return _playlist_from_matches(matches, lambda m: m[-1])
+            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
  
          matches = DailymotionIE._extract_urls(webpage)
          if matches:
-            return _playlist_from_matches(matches)
+            return self.playlist_from_matches(matches, video_id, video_title)
  
          # Look for embedded Dailymotion playlist player (#3822)
          m = re.search(
@@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor):
              playlists = re.findall(
                  r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
              if playlists:
-                return _playlist_from_matches(
-                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
+                return self.playlist_from_matches(
+                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
  
          # Look for embedded Wistia player
          match = re.search(
@@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
              if embeds:
-                return _playlist_from_matches(
-                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
+                return self.playlist_from_matches(
+                    embeds, video_id, video_title,
+                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
  
          # Look for Aparat videos
          mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor):
          # Look for funnyordie embed
          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
          if matches:
-            return _playlist_from_matches(
-                matches, getter=unescapeHTML, ie='FunnyOrDie')
+            return self.playlist_from_matches(
+                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
  
          # Look for BBC iPlayer embed
          matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
          if matches:
-            return _playlist_from_matches(matches, ie='BBCCoUk')
+            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
  
          # Look for embedded RUTV player
          rutv_url = RUTVIE._extract_url(webpage)
@@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor):
          # Look for embedded SportBox player
          sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
          if sportbox_urls:
-            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
  
          # Look for embedded XHamster player
          xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
          if xhamster_urls:
-            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
  
          # Look for embedded TNAFlixNetwork player
          tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
          if tnaflix_urls:
-            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
  
          # Look for embedded PornHub player
          pornhub_urls = PornHubIE._extract_urls(webpage)
          if pornhub_urls:
-            return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
+            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
  
          # Look for embedded DrTuber player
          drtuber_urls = DrTuberIE._extract_urls(webpage)
          if drtuber_urls:
-            return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
+            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
  
          # Look for embedded RedTube player
          redtube_urls = RedTubeIE._extract_urls(webpage)
          if redtube_urls:
-            return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
+            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
  
          # Look for embedded Tvigle player
          mobj = re.search(
@@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor):
          # Look for embedded soundcloud player
          soundcloud_urls = SoundcloudIE._extract_urls(webpage)
          if soundcloud_urls:
-            return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
  
          # Look for tunein player
          tunein_urls = TuneInBaseIE._extract_urls(webpage)
          if tunein_urls:
-            return _playlist_from_matches(tunein_urls)
+            return self.playlist_from_matches(tunein_urls, video_id, video_title)
  
          # Look for embedded mtvservices player
          mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor):
          # Look for DBTV embeds
          dbtv_urls = DBTVIE._extract_urls(webpage)
          if dbtv_urls:
-            return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
+            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
  
          # Look for Videa embeds
          videa_urls = VideaIE._extract_urls(webpage)
          if videa_urls:
-            return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
+            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
  
          # Look for 20 minuten embeds
          twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
          if twentymin_urls:
-            return _playlist_from_matches(
-                twentymin_urls, ie=TwentyMinutenIE.ie_key())
+            return self.playlist_from_matches(
+                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
  
          # Look for Openload embeds
          openload_urls = OpenloadIE._extract_urls(webpage)
          if openload_urls:
-            return _playlist_from_matches(
-                openload_urls, ie=OpenloadIE.ie_key())
+            return self.playlist_from_matches(
+                openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
  
          # Look for VideoPress embeds
          videopress_urls = VideoPressIE._extract_urls(webpage)
          if videopress_urls:
-            return _playlist_from_matches(
-                videopress_urls, ie=VideoPressIE.ie_key())
+            return self.playlist_from_matches(
+                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
  
          # Look for Rutube embeds
          rutube_urls = RutubeIE._extract_urls(webpage)
          if rutube_urls:
-            return _playlist_from_matches(
+            return self.playlist_from_matches(
                  rutube_urls, ie=RutubeIE.ie_key())
  
          # Looking for http://schema.org/VideoObject
author	John Hawkinson <jhawk@mit.edu>
	Wed, 8 Mar 2017 23:13:54 +0000 (18:13 -0500)
committer	Yen Chi Hsuan <yan12125@gmail.com>
	Sun, 19 Mar 2017 12:40:31 +0000 (20:40 +0800)
youtube_dl/extractor/bostonglobe.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/extractors.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history