[smotri] Remove non relevant test

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 9096a29756ca6e1a66ecd442a92977fa1b999b31..d9240ff02b5f6d5ba19350044022ae535443a599 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -32,6 +32,7 @@ from ..utils import (
      unescapeHTML,
      unified_strdate,
      uppercase_escape,
+    ISO3166Utils,
  )
  
  
@@ -234,6 +235,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          '44': {'ext': 'webm', 'width': 854, 'height': 480},
          '45': {'ext': 'webm', 'width': 1280, 'height': 720},
          '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},
  
  
          # 3d videos
@@ -785,7 +788,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              s = mobj.group(1)
              dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
              return '/signature/%s' % dec_s
-        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
          dash_doc = self._download_xml(
              dash_manifest_url, video_id,
              note='Downloading DASH manifest',
@@ -901,6 +904,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                          break
          if 'token' not in video_info:
              if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+                    if regions_allowed is not None:
+                        raise ExtractorError('YouTube said: This video is available in %s only' % (
+                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+                            expected=True)
                  raise ExtractorError(
                      'YouTube said: %s' % video_info['reason'][0],
                      expected=True, video_id=video_id)
@@ -1126,12 +1135,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      self.report_warning(
                          'Skipping DASH manifest: %r' % e, video_id)
                  else:
-                    # Hide the formats we found through non-DASH
+                    # Remove the formats we found through non-DASH: they
+                    # contain less info, and that info can be wrong because
+                    # we use fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
                      dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
+                    formats = [f for f in formats if f['format_id'] not in dash_keys]
                      formats.extend(dash_formats)
  
          # Check for malformed aspect ratio
@@ -1289,7 +1299,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      def _extract_playlist(self, playlist_id):
          url = self._TEMPLATE_URL % playlist_id
          page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
  
          for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
              match = match.strip()
@@ -1309,36 +1318,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                  self.report_warning('Youtube gives an alert message: ' + match)
  
          # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
  
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
  
          playlist_title = self._html_search_regex(
              r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
              page, 'title')
  
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
  
      def _real_extract(self, url):
          # Extract playlist id
@@ -1398,6 +1407,24 @@ class YoutubeChannelIE(InfoExtractor):
          channel_id = self._match_id(url)
  
          url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778).
+        # Work around this by extracting the channel as a playlist when its playlist id can be obtained;
+        # otherwise fall back to page-by-page extraction (a failed non-fatal download counts as an empty page).
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False) or ''
+        channel_playlist_id = self._html_search_meta(
+            'channelId', channel_page, 'channel id', default=None)
+        if not channel_playlist_id:
+            channel_playlist_id = self._search_regex(
+                r'data-channel-external-id="([^"]+)"',
+                channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
          channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
          autogenerated = re.search(r'''(?x)
                  class="[^"]*?(?:
@@ -1486,7 +1513,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
  
          for pagenum in itertools.count(1):
              url_query = {
-                'search_query': query,
+                'search_query': query.encode('utf-8'),
                  'page': pagenum,
                  'spf': 'navigate',
              }
@@ -1621,10 +1648,16 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          # for the video ids doesn't contain an index
          ids = []
          more_widget_html = content_html = page
-
          for page_num in itertools.count(1):
              matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
+
+            # The 'recommended' feed has an infinite 'load more' that keeps serving the
+            # same videos in (sometimes) slightly different order, so keep only unseen
+            # ids and stop when none are left; a list, as filter() is truthy on Python 3
+            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
+            if not new_ids:
+                break
+
              ids.extend(new_ids)
  
              mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)