[youtube] Add support for yourepeat.com URLs (Closes #2397)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 8816f4f801b60eba8fed2042a2f3a7031c7a08c6..02c5ede745beeb97fb71fe57abab7b963db2266f 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -34,13 +34,14 @@ from ..utils import (
      unified_strdate,
      orderedSet,
      write_json_file,
+    uppercase_escape,
  )
  
  class YoutubeBaseInfoExtractor(InfoExtractor):
      """Provide base functions for Youtube extractors"""
      _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
      _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
-    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
      _NETRC_MACHINE = 'youtube'
      # If True it will raise an error if no login info is provided
      _LOGIN_REQUIRED = False
@@ -111,7 +112,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              'next_url': '/',
              'action_confirm': 'Confirm',
          }
-        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+        req = compat_urllib_request.Request(self._AGE_URL,
+            compat_urllib_parse.urlencode(age_form).encode('ascii'))
  
          self._download_webpage(
              req, None,
@@ -135,14 +137,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                           (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                           (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              (?:www\.)?deturl\.com/www\.youtube\.com/|
-                            (?:www\.)?pwnyoutube\.com|
+                            (?:www\.)?pwnyoutube\.com/|
+                            (?:www\.)?yourepeat\.com/|
                              tube\.majestyc\.net/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                           (?:                                                  # the various things that can precede the ID:
                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                               |(?:                                             # or the v= param in all its forms
-                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                   (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                   (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                   v=
@@ -207,6 +210,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
  
          # Dash webm
+        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
          '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
          '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
          '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
@@ -495,7 +504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  return a % b
  
              m = re.match(
-                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
              if m:
                  fname = m.group('func')
                  if fname not in functions:
@@ -1008,7 +1017,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def _get_available_subtitles(self, video_id, webpage):
          try:
              sub_list = self._download_webpage(
-                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
+                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                  video_id, note=False)
          except ExtractorError as err:
              self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
@@ -1024,7 +1033,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                  'name': unescapeHTML(l[0]).encode('utf-8'),
              })
-            url = u'http://www.youtube.com/api/timedtext?' + params
+            url = u'https://www.youtube.com/api/timedtext?' + params
              sub_lang_list[lang] = url
          if not sub_lang_list:
              self._downloader.report_warning(u'video doesn\'t have subtitles')
@@ -1078,8 +1087,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              self._downloader.report_warning(err_msg)
              return {}
  
-    def _extract_id(self, url):
-        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
+    @classmethod
+    def extract_id(cls, url):
+        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group(2)
@@ -1108,7 +1118,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          mobj = re.search(self._NEXT_URL_RE, url)
          if mobj:
              url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
-        video_id = self._extract_id(url)
+        video_id = self.extract_id(url)
  
          # Get video webpage
          url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
@@ -1415,7 +1425,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
  class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
-    _VALID_URL = r"""(?:
+    _VALID_URL = r"""(?x)(?:
                          (?:https?://)?
                          (?:\w+\.)?
                          youtube\.com/
@@ -1424,7 +1434,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                             \? (?:.*?&)*? (?:p|a|list)=
                          |  p/
                          )
-                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
+                        (
+                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+                            # Top tracks, they can also include dots 
+                            |(?:MC)[\w\.]*
+                        )
                          .*
                       |
                          ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@@ -1434,11 +1448,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
      IE_NAME = u'youtube:playlist'
  
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
      def _real_initialize(self):
          self._login()
  
@@ -1462,7 +1471,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
  
      def _real_extract(self, url):
          # Extract playlist id
-        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
+        mobj = re.match(self._VALID_URL, url)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
          playlist_id = mobj.group(1) or mobj.group(2)
@@ -1523,7 +1532,7 @@ class YoutubeTopListIE(YoutubePlaylistIE):
          channel = mobj.group('chann')
          title = mobj.group('title')
          query = compat_urllib_parse.urlencode({'title': title})
-        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
          channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
          link = self._html_search_regex(playlist_re, channel_page, u'list')
          url = compat_urlparse.urljoin('https://www.youtube.com/', link)
@@ -1548,7 +1557,7 @@ class YoutubeChannelIE(InfoExtractor):
      IE_DESC = u'YouTube.com channels'
      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
-    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
+    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
      IE_NAME = u'youtube:channel'
  
      def extract_videos_from_page(self, page):
@@ -1583,11 +1592,10 @@ class YoutubeChannelIE(InfoExtractor):
              # Download all channel pages using the json-based channel_ajax query
              for pagenum in itertools.count(1):
                  url = self._MORE_PAGES_URL % (pagenum, channel_id)
-                page = self._download_webpage(url, channel_id,
-                                              u'Downloading page #%s' % pagenum)
-    
-                page = json.loads(page)
-    
+                page = self._download_json(
+                    url, channel_id, note=u'Downloading page #%s' % pagenum,
+                    transform_source=uppercase_escape)
+
                  ids_in_page = self.extract_videos_from_page(page['content_html'])
                  video_ids.extend(ids_in_page)
      
@@ -1604,9 +1612,9 @@ class YoutubeChannelIE(InfoExtractor):
  class YoutubeUserIE(InfoExtractor):
      IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
      _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
+    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
      _GDATA_PAGE_SIZE = 50
-    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
+    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
      IE_NAME = u'youtube:user'
  
      @classmethod
@@ -1655,7 +1663,7 @@ class YoutubeUserIE(InfoExtractor):
                      '_type': 'url',
                      'url': video_id,
                      'ie_key': 'Youtube',
-                    'id': 'video_id',
+                    'id': video_id,
                      'title': title,
                  }
          url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
@@ -1687,7 +1695,8 @@ class YoutubeSearchIE(SearchInfoExtractor):
              api_response = data['data']
  
              if 'items' not in api_response:
-                raise ExtractorError(u'[youtube] No video results')
+                raise ExtractorError(
+                    u'[youtube] No video results', expected=True)
  
              new_ids = list(video['id'] for video in api_response['items'])
              video_ids += new_ids
@@ -1737,7 +1746,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          action = 'action_load_system_feed'
          if self._PERSONAL_FEED:
              action = 'action_load_personal_feed'
-        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
+        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
  
      @property
      def IE_NAME(self):
@@ -1807,7 +1816,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list
      _VALID_URL = r'''(?x)
-        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
          (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
      '''