[youtube:truncated_url] Move test to extractor

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index f0d1f6400cb1514b205022a2b89d8e568d263c53..bf0fbc924be8e9355569acd5426f639523aacdb1 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -223,6 +223,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
          '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
  
          # Dash webm audio
          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
@@ -439,7 +441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
      def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
-            r'signature=([a-zA-Z]+)', jscode,
+            r'signature=([$a-zA-Z]+)', jscode,
               u'Initial JS player signature function name')
  
          jsi = JSInterpreter(jscode)
@@ -1140,7 +1142,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
          if mobj is None:
              mobj = re.search(
-                r'(?s)id="watch-uploader-info".*?>.*?Published on (.*?)</strong>',
+                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                  video_webpage)
          if mobj is not None:
              upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
@@ -1385,13 +1387,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                          |  p/
                          )
                          (
-                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                              # Top tracks, they can also include dots 
                              |(?:MC)[\w\.]*
                          )
                          .*
                       |
-                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
+                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                       )"""
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
      _MORE_PAGES_INDICATOR = r'data-link-type="next"'
@@ -1414,11 +1416,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
          title_span = (search_title('playlist-title') or
              search_title('title long-title') or search_title('title'))
          title = clean_html(title_span)
-        video_re = r'''(?x)data-video-username="(.*?)".*?
+        video_re = r'''(?x)data-video-username=".*?".*?
                         href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
-        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
-        # Some of the videos may have been deleted, their username field is empty
-        ids = [video_id for (username, video_id) in matches if username]
+        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
          url_results = self._ids_to_results(ids)
  
          return self.playlist_result(url_results, playlist_id, title)
@@ -1775,9 +1775,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
              feed_entries.extend(
                  self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in ids)
-            if info['paging'] is None:
+            mobj = re.search(
+                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
+                feed_html)
+            if mobj is None:
                  break
-            paging = info['paging']
+            paging = mobj.group('paging')
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
  
  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
@@ -1822,10 +1825,21 @@ class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list
      _VALID_URL = r'''(?x)
-        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
+        (?:https?://)?[^/]+/watch\?(?:
+            feature=[a-z_]+|
+            annotation_id=annotation_[^&]+
+        )?$|
          (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
      '''
  
+    _TESTS = [{
+        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.youtube.com/watch?',
+        'only_matching': True,
+    }]
+
      def _real_extract(self, url):
          raise ExtractorError(
              u'Did you forget to quote the URL? Remember that & is a meta '