[Yahoo/NBCSports] Generalize NBC sports info extractor

author Yen Chi Hsuan <yan12125@gmail.com>

Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)

committer Yen Chi Hsuan <yan12125@gmail.com>

Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)
author Yen Chi Hsuan <yan12125@gmail.com>
Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 5d0d2a9bc6f996364aa7703cfab67077eda8c093..b113aaec6d4ce5c9eba17d5b25e448da33795238 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -310,6 +310,7 @@ from .naver import NaverIE
  from .nba import NBAIE
  from .nbc import (
      NBCIE,
+    NBCSportsVPlayerIE,
      NBCSportsIE,
      NBCNewsIE,
  )
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py

index 033bf71f0325899a76f04fede12313be12f7e872..c8dd72ab489652c12c661cf00cb6fa32bf9870a2 100644 (file)
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -50,7 +50,7 @@ class NBCIE(InfoExtractor):
          return self.url_result(theplatform_url)
  
  
-class NBCSportsIE(InfoExtractor):
+class NBCSportsVPlayerIE(InfoExtractor):
      _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z]+)'
  
      _TEST = {
@@ -64,6 +64,13 @@ class NBCSportsIE(InfoExtractor):
          }
      }
  
+    @staticmethod
+    def _extract_url(webpage):
+        iframe_m = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+        if iframe_m:
+            return iframe_m.group('url')
+
      def _real_extract(self, url):
          video_id = self._match_id(url)
          webpage = self._download_webpage(url, video_id)
@@ -71,6 +78,28 @@ class NBCSportsIE(InfoExtractor):
          return self.url_result(theplatform_url, 'ThePlatform')
  
  
+class NBCSportsIE(InfoExtractor):
+    # Does not include https because its certificate is invalid
+    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+    _TEST = {
+        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+        'md5': 'ba6c93f96b67bf05344f78bd523dac0f',
+        'info_dict': {
+            'id': 'PHJSaFWbrTY9',
+            'ext': 'flv',
+            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        return self.url_result(
+            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
  class NBCNewsIE(InfoExtractor):
      _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
          (?:video/.+?/(?P<id>\d+)|
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py

index 6e72f1e5552d38d4a2baf8d2b14f3ce01ae3342c..43776d1e6b30d3f413c2490b6c7c6161f4cf1efa 100644 (file)
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -17,6 +17,8 @@ from ..utils import (
      int_or_none,
  )
  
+from .nbc import NBCSportsVPlayerIE
+
  
  class YahooIE(InfoExtractor):
      IE_DESC = 'Yahoo screen and movies'
@@ -132,6 +134,7 @@ class YahooIE(InfoExtractor):
          }, {
              'note': 'NBC Sports embeds',
              'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+            'md5': 'ceae8dced5c14a1c1ffcb7a32194cca5',
              'info_dict': {
                  'id': '9CsDKds0kvHI',
                  'ext': 'flv',
@@ -161,10 +164,9 @@ class YahooIE(InfoExtractor):
                  video_id = items[0]['id']
                  return self._get_info(video_id, display_id, webpage)
          # Look for NBCSports iframes
-        iframe_m = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
-        if iframe_m:
-            return self.url_result(iframe_m.group('url'), 'NBCSports')
+        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+        if nbc_sports_url:
+            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
  
          items_json = self._search_regex(
              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
author	Yen Chi Hsuan <yan12125@gmail.com>
	Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)
committer	Yen Chi Hsuan <yan12125@gmail.com>
	Mon, 30 Mar 2015 18:47:18 +0000 (02:47 +0800)
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/nbc.py		patch \| blob \| history
youtube_dl/extractor/yahoo.py		patch \| blob \| history