X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyahoo.py;h=f9afbdbab611e233c7f7014ae7d66e996f2b7c31;hb=b407e173e44041b1a92fb61e316f92d19834a40a;hp=43776d1e6b30d3f413c2490b6c7c6161f4cf1efa;hpb=a2a4d5fa313d5244d24fa70d5db91971a7583d79;p=youtube-dl

diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 43776d1e6..f9afbdbab 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,6 +15,7 @@ from ..utils import (
     unescapeHTML,
     ExtractorError,
     int_or_none,
+    mimetype2ext,
 )
 
 from .nbc import NBCSportsVPlayerIE
@@ -22,7 +23,7 @@ from .nbc import NBCSportsVPlayerIE
 
 class YahooIE(InfoExtractor):
     IE_DESC = 'Yahoo screen and movies'
-    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
     _TESTS = [
         {
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -134,19 +135,21 @@ class YahooIE(InfoExtractor):
         }, {
             'note': 'NBC Sports embeds',
             'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
-            'md5': 'ceae8dced5c14a1c1ffcb7a32194cca5',
             'info_dict': {
                 'id': '9CsDKds0kvHI',
                 'ext': 'flv',
                 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
                 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
             }
+        }, {
+            'url': 'https://tw.news.yahoo.com/-100120367.html',
+            'only_matching': True,
         }
     ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
+        display_id = mobj.group('display_id') or self._match_id(url)
         page_id = mobj.group('id')
         url = mobj.group('url')
         host = mobj.group('host')
@@ -234,6 +237,22 @@ class YahooIE(InfoExtractor):
 
         self._sort_formats(formats)
 
+        closed_captions = self._html_search_regex(
+            r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+            default='[]')
+
+        cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+        subtitles = {}
+        if cc_json:
+            for closed_caption in cc_json:
+                lang = closed_caption['lang']
+                if lang not in subtitles:
+                    subtitles[lang] = []
+                subtitles[lang].append({
+                    'url': closed_caption['url'],
+                    'ext': mimetype2ext(closed_caption['content_type']),
+                })
+
         return {
             'id': video_id,
             'display_id': display_id,
@@ -242,6 +261,7 @@ class YahooIE(InfoExtractor):
             'description': clean_html(meta['description']),
             'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
             'duration': int_or_none(meta.get('duration')),
+            'subtitles': subtitles,
         }