[youtube] fix extraction for embed restricted live streams (fixes #16433)

[youtube-dl] / youtube_dl / extractor / tvplay.py
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py

index 0c072a6aec88a8d05c44fa24db311cdb7dcd7ce0..e09b5f804d897954f4488344d27beaa8a7a2eea6 100644 (file)
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -5,29 +5,43 @@ import re
  
  from .common import InfoExtractor
  from ..compat import (
+    compat_HTTPError,
      compat_str,
      compat_urlparse,
  )
  from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
      parse_iso8601,
      qualities,
-    determine_ext,
+    smuggle_url,
+    try_get,
+    unsmuggle_url,
      update_url_query,
-    int_or_none,
  )
  
  
  class TVPlayIE(InfoExtractor):
-    IE_DESC = 'TV3Play and related services'
-    _VALID_URL = r'''(?x)https?://(?:www\.)?
-        (?:tvplay(?:\.skaties)?\.lv/parraides|
-           (?:tv3play|play\.tv3)\.lt/programos|
-           tv3play(?:\.tv3)?\.ee/sisu|
-           tv(?:3|6|8|10)play\.se/program|
-           (?:(?:tv3play|viasat4play|tv6play)\.no|tv3play\.dk)/programmer|
-           play\.novatv\.bg/programi
-        )/[^/]+/(?P<id>\d+)
-        '''
+    IE_NAME = 'mtg'
+    IE_DESC = 'MTG services'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        mtg:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                tvplay(?:\.skaties)?\.lv/parraides|
+                                (?:tv3play|play\.tv3)\.lt/programos|
+                                tv3play(?:\.tv3)?\.ee/sisu|
+                                (?:tv(?:3|6|8|10)play|viafree)\.se/program|
+                                (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
+                                play\.novatv\.bg/programi
+                            )
+                            /(?:[^/]+/)+
+                        )
+                        (?P<id>\d+)
+                    '''
      _TESTS = [
          {
              'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true',
@@ -37,6 +51,9 @@ class TVPlayIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'Kādi ir īri? - Viņas melo labāk',
                  'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.',
+                'series': 'Viņas melo labāk',
+                'season': '2.sezona',
+                'season_number': 2,
                  'duration': 25,
                  'timestamp': 1406097056,
                  'upload_date': '20140723',
@@ -49,6 +66,10 @@ class TVPlayIE(InfoExtractor):
                  'ext': 'flv',
                  'title': 'Moterys meluoja geriau',
                  'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e',
+                'series': 'Moterys meluoja geriau',
+                'episode_number': 47,
+                'season': '1 sezonas',
+                'season_number': 1,
                  'duration': 1330,
                  'timestamp': 1403769181,
                  'upload_date': '20140626',
@@ -185,26 +206,51 @@ class TVPlayIE(InfoExtractor):
              'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
              'only_matching': True,
          },
+        {
+            # views is null
+            'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183',
+            'only_matching': True,
+        },
          {
              'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true',
              'only_matching': True,
+        },
+        {
+            'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
+            'only_matching': True,
+        },
+        {
+            'url': 'mtg:418113',
+            'only_matching': True,
          }
      ]
  
      def _real_extract(self, url):
-        video_id = self._match_id(url)
+        url, smuggled_data = unsmuggle_url(url, {})
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
  
+        video_id = self._match_id(url)
+        geo_country = self._search_regex(
+            r'https?://[^/]+\.([a-z]{2})', url,
+            'geo country', default=None)
+        if geo_country:
+            self._initialize_geo_bypass({'countries': [geo_country.upper()]})
          video = self._download_json(
-            'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON')
+            'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
  
          title = video['title']
  
-        if video.get('is_geo_blocked'):
-            self.report_warning(
-                'This content might not be available in your country due to copyright reasons')
-
-        streams = self._download_json(
-            'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON')
+        try:
+            streams = self._download_json(
+                'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id,
+                video_id, 'Downloading streams JSON')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
+                raise ExtractorError(msg['msg'], expected=True)
+            raise
  
          quality = qualities(['hls', 'medium', 'high'])
          formats = []
@@ -229,6 +275,8 @@ class TVPlayIE(InfoExtractor):
                      'ext': ext,
                  }
                  if video_url.startswith('rtmp'):
+                    if smuggled_data.get('skip_rtmp'):
+                        continue
                      m = re.search(
                          r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url)
                      if not m:
@@ -244,6 +292,11 @@ class TVPlayIE(InfoExtractor):
                          'url': video_url,
                      })
                  formats.append(fmt)
+
+        if not formats and video.get('is_geo_blocked'):
+            self.raise_geo_restricted(
+                'This content might not be available in your country due to copyright reasons')
+
          self._sort_formats(formats)
  
          # TODO: webvtt in m3u8
@@ -257,14 +310,138 @@ class TVPlayIE(InfoExtractor):
                  'url': sami_path,
              }]
  
+        series = video.get('format_title')
+        episode_number = int_or_none(video.get('format_position', {}).get('episode'))
+        season = video.get('_embedded', {}).get('season', {}).get('title')
+        season_number = int_or_none(video.get('format_position', {}).get('season'))
+
          return {
              'id': video_id,
              'title': title,
              'description': video.get('description'),
+            'series': series,
+            'episode_number': episode_number,
+            'season': season,
+            'season_number': season_number,
              'duration': int_or_none(video.get('duration')),
              'timestamp': parse_iso8601(video.get('created_at')),
-            'view_count': int_or_none(video.get('views', {}).get('total')),
+            'view_count': try_get(video, lambda x: x['views']['total'], int),
              'age_limit': int_or_none(video.get('age_limit', 0)),
              'formats': formats,
              'subtitles': subtitles,
          }
+
+
+class ViafreeIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        viafree\.
+                        (?:
+                            (?:dk|no)/programmer|
+                            se/program
+                        )
+                        /(?:[^/]+/)+(?P<id>[^/?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
+        'info_dict': {
+            'id': '395375',
+            'ext': 'mp4',
+            'title': 'Husräddarna S02E02',
+            'description': 'md5:4db5c933e37db629b5a2f75dfb34829e',
+            'series': 'Husräddarna',
+            'season': 'Säsong 2',
+            'season_number': 2,
+            'duration': 2576,
+            'timestamp': 1400596321,
+            'upload_date': '20140520',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [TVPlayIE.ie_key()],
+    }, {
+        # with relatedClips
+        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
+        'info_dict': {
+            'id': '758770',
+            'ext': 'mp4',
+            'title': 'Sommaren med YouTube-stjärnorna S01E01',
+            'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f',
+            'series': 'Sommaren med YouTube-stjärnorna',
+            'season': 'Säsong 1',
+            'season_number': 1,
+            'duration': 1326,
+            'timestamp': 1470905572,
+            'upload_date': '20160811',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [TVPlayIE.ie_key()],
+    }, {
+        # Different og:image URL schema
+        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script',
+                webpage, 'data', default='{}'),
+            video_id, transform_source=lambda x: re.sub(
+                r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*',
+                'null', x), fatal=False)
+
+        video_id = None
+
+        if data:
+            video_id = try_get(
+                data, lambda x: x['context']['dispatcher']['stores'][
+                    'ContentPageProgramStore']['currentVideo']['id'],
+                compat_str)
+
+        # Fallback #1 (extract from og:image URL schema)
+        if not video_id:
+            thumbnail = self._og_search_thumbnail(webpage, default=None)
+            if thumbnail:
+                video_id = self._search_regex(
+                    # Patterns seen:
+                    #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg
+                    #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg
+                    r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/',
+                    thumbnail, 'video id', default=None)
+
+        # Fallback #2 (extract from the raw JSON string in the webpage).
+        # May extract the wrong video id if relatedClips is present.
+        if not video_id:
+            video_id = self._search_regex(
+                r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})',
+                webpage, 'video id')
+
+        return self.url_result(
+            smuggle_url(
+                'mtg:%s' % video_id,
+                {
+                    'geo_countries': [
+                        compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]],
+                    # rtmp host mtgfs.fplive.net for viafree is unresolvable
+                    'skip_rtmp': True,
+                }),
+            ie=TVPlayIE.ie_key(), video_id=video_id)