[mlb] Extract more metadata and all formats, provide more tests

author Sergey M․ <dstftw@gmail.com>

Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)

committer Sergey M․ <dstftw@gmail.com>

Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)
author Sergey M․ <dstftw@gmail.com>
Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)
committer Sergey M․ <dstftw@gmail.com>
Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 14133c3158c1b7512d0c3f5f1aabe4159e02753c..c5961cab95de0a1884d307501913838bee8a65f9 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -170,7 +170,7 @@ from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
  from .mixcloud import MixcloudIE
  from .metacritic import MetacriticIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
  from .mixcloud import MixcloudIE
-from .mlb import MlbIE
+from .mlb import MLBIE
  from .mpora import MporaIE
  from .mofosex import MofosexIE
  from .mooshare import MooshareIE
  from .mpora import MporaIE
  from .mofosex import MofosexIE
  from .mooshare import MooshareIE
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py

index 61ba588438676e5a1e053a4200513b3c1b2c634e..18ab2c135217b8a57d3bd74b883d150b4da9507d 100644 (file)
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -3,72 +3,100 @@ from __future__ import unicode_literals
  import re
  
  from .common import InfoExtractor
  import re
  
  from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+    find_xpath_attr,
+)
  
  
  
  
-class MlbIE(InfoExtractor):
-    _VALID_URL = r'http?://m\.mlb\.com/video/topic/[0-9]+/v(?P<id>n?\d+)/.*$'
-    _TEST = {
-        'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
-        'md5': u'd9c022c10d21f849f49c05ae12a8a7e9',
-        'info_dict': {
-            'id': '34496663',
-            'ext': 'mp4',
-            'format': 'mp4',
-            'description': "7/11/14: Giancarlo Stanton practices for the Home Run Derby prior to the game against the Mets",
-            'title': "Stanton prepares for Derby",
+class MLBIE(InfoExtractor):
+    _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
+    _TESTS = [
+        {
+            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
+            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
+            'info_dict': {
+                'id': '34496663',
+                'ext': 'mp4',
+                'title': 'Stanton prepares for Derby',
+                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
+                'duration': 46,
+                'timestamp': 1405105800,
+                'upload_date': '20140711',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
          },
          },
-    }
+        {
+            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
+            'md5': '0e6e73d509321e142409b695eadd541f',
+            'info_dict': {
+                'id': '34578115',
+                'ext': 'mp4',
+                'title': 'Cespedes repeats as Derby champ',
+                'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
+                'duration': 488,
+                'timestamp': 1405399936,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
+            'md5': 'b8fd237347b844365d74ea61d4245967',
+            'info_dict': {
+                'id': '34577915',
+                'ext': 'mp4',
+                'title': 'Bautista on Home Run Derby',
+                'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
+                'duration': 52,
+                'timestamp': 1405390722,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+    ]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
  
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
  
-        webpage = self._download_webpage(url, video_id)
+        detail = self._download_xml(
+            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
+            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
+
+        title = detail.find('./headline').text
+        description = detail.find('./big-blurb').text
+        duration = parse_duration(detail.find('./duration').text)
+        timestamp = parse_iso8601(detail.attrib['date'][:-5])
+
+        thumbnail = find_xpath_attr(
+            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
  
  
-        title = self._og_search_title(webpage, default=video_id)
-        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)"/>', webpage, 'description', fatal=False)
-        thumbnail = self._html_search_regex(r'<meta itemprop="image" (?:content|value)="(.*?)" />', webpage, 'image', fatal=False)
+        formats = []
+        for media_url in detail.findall('./url'):
+            playback_scenario = media_url.attrib['playback_scenario']
+            fmt = {
+                'url': media_url.text,
+                'format_id': playback_scenario,
+            }
+            m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
+            if m:
+                fmt.update({
+                    'vbr': int(m.group('vbr')) * 1000,
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+            formats.append(fmt)
  
  
-        # use the video_id to find the Media detail XML
-        id_len = len(video_id)
-        _mediadetail_url = 'http://m.mlb.com/gen/multimedia/detail/'+video_id[id_len-3]+'/'+video_id[id_len-2]+'/'+video_id[id_len-1]+'/'+video_id+'.xml'
-        
-        mediadetails = self._download_xml(_mediadetail_url, video_id, "Downloading media detail...")
-        has1500K = 0
-        has1200K = 0
-        has600K = 0
-        # loop through the list of url's and only get the highest quality MP4 content
-        for element in mediadetails.findall('url'):
-            scenario = element.attrib['playback_scenario']
-            if scenario.startswith(u'FLASH'):
-                if scenario.startswith(u'FLASH_1800K'):
-                    video_url = element.text
-                    # 1800K is the current highest quality video on MLB.com
-                    break
-                else:
-                    if scenario.startswith(u'FLASH_1500K'):
-                        video_url = element.text
-                        has1500K = 1
-                    else:
-                        if (scenario.startswith(u'FLASH_1200K') and not has1500K):
-                            video_url = element.text
-                            has1200K = 1
-                        else:
-                            if (scenario.startswith(u'FLASH_600K') and not has1200K):
-                                video_url = element.text
-                                has600K = 1
-                            else:
-                                if (scenario.startswith(u'FLASH_300K') and not has600K):
-                                    video_url = element.text
+        self._sort_formats(formats)
  
          return {
              'id': video_id,
  
          return {
              'id': video_id,
-            'url': video_url,
-            'extractor': 'mlb',
-            'webpage_url': url,
              'title': title,
              'title': title,
-            'ext': 'mp4',
-            'format': 'mp4',
              'description': description,
              'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
              'thumbnail': thumbnail,
          }
              'thumbnail': thumbnail,
          }
author	Sergey M․ <dstftw@gmail.com>
	Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Wed, 16 Jul 2014 13:40:28 +0000 (20:40 +0700)
youtube_dl/extractor/__init__.py		patch | blob | history
youtube_dl/extractor/mlb.py		patch | blob | history