[r7] Fix extraction and add support for articles (Closes #9826)
author Sergey M․ <dstftw@gmail.com>
Sat, 18 Jun 2016 19:25:34 +0000 (02:25 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sat, 18 Jun 2016 19:25:34 +0000 (02:25 +0700)
youtube_dl/extractor/extractors.py
youtube_dl/extractor/r7.py

index 2ff8676510ec17945c7f70d42208d0be49011616..b1b04f2fc1d79fc3338f227917ac29392071bc0b 100644 (file)
@@ -631,7 +631,10 @@ from .qqmusic import (
     QQMusicToplistIE,
     QQMusicPlaylistIE,
 )
-from .r7 import R7IE
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
 from .radiocanada import (
     RadioCanadaIE,
     RadioCanadaAudioVideoIE,
index 976c8feec657f8de731d3ffadfa09189ed1628cf..069dbfaed0638e396d024ec81d5142d18f9ad90f 100644 (file)
@@ -2,22 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    unescapeHTML,
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class R7IE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
+                        https?://
                         (?:
                             (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
                             noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
                             player\.r7\.com/video/i/
                         )
                         (?P<id>[\da-f]{24})
-                        '''
+                    '''
     _TESTS = [{
         'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
         'md5': '403c4e393617e8e8ddc748978ee8efde',
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
             'id': '54e7050b0cf2ff57e0279389',
             'ext': 'mp4',
             'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 98,
             'like_count': int,
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://player.r7.com/video/i/%s' % video_id, video_id)
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
 
-        item = self._parse_json(js_to_json(self._search_regex(
-            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
-        title = unescapeHTML(item['title'])
-        thumbnail = item.get('init', {}).get('thumbUri')
-        duration = None
-
-        statistics = item.get('statistics', {})
-        like_count = int_or_none(statistics.get('likes'))
-        view_count = int_or_none(statistics.get('views'))
+        title = video['title']
 
         formats = []
-        for format_key, format_dict in item['playlist'][0].items():
-            src = format_dict.get('src')
-            if not src:
-                continue
-            format_id = format_dict.get('format') or format_key
-            if duration is None:
-                duration = format_dict.get('duration')
-            if '.f4m' in src:
-                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
-            elif src.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
-            else:
-                formats.append({
-                    'url': src,
-                    'format_id': format_id,
-                })
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # m3u8 format always matches the http format, let's copy metadata from
+            # one to another
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                formats))
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
         self._sort_formats(formats)
 
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
         return {
             'id': video_id,
             'title': title,
+            'description': description,
             'thumbnail': thumbnail,
             'duration': duration,
             'like_count': like_count,
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class R7ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())