_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     OnDemandPagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36
  37 class YoutubeBaseInfoExtractor(InfoExtractor):
  38     """Provide base functions for Youtube extractors"""
  39     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  40     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  41     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         return bool(self._download_webpage(
  48             self._LANG_URL, None,
  49             note='Setting language', errnote='unable to set language',
  50             fatal=False))
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note='Downloading login page',
  70             errnote='unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79             'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80             'Email': username,
  81             'GALX': galx,
  82             'Passwd': password,
  83
  84             'PersistentCookie': 'yes',
  85             '_utf8': '霱',
  86             'bgresponse': 'js_disabled',
  87             'checkConnection': '',
  88             'checkedDomains': 'youtube',
  89             'dnConn': '',
  90             'pstMsg': '0',
  91             'rmShown': '1',
  92             'secTok': '',
  93             'signIn': 'Sign in',
  94             'timeStmp': '',
  95             'service': 'youtube',
  96             'uilel': '3',
  97             'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note='Logging in', errnote='unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning('unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning('unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _real_initialize(self):
 179         if self._downloader is None:
 180             return
 181         if self._get_login_info()[0] is not None:
 182             if not self._set_language():
 183                 return
 184         if not self._login():
 185             return
 186
 187
 188 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 189     IE_DESC = 'YouTube.com'
 190     _VALID_URL = r"""(?x)^
 191                      (
 192                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 193                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 194                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 195                             (?:www\.)?pwnyoutube\.com/|
 196                             (?:www\.)?yourepeat\.com/|
 197                             tube\.majestyc\.net/|
 198                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 199                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 200                          (?:                                                  # the various things that can precede the ID:
 201                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 202                              |(?:                                             # or the v= param in all its forms
 203                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 204                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 205                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 206                                  v=
 207                              )
 208                          ))
 209                          |youtu\.be/                                          # just youtu.be/xxxx
 210                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 211                          )
 212                      )?                                                       # all until now is optional -> you can pass the naked ID
 213                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 214                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 215                      (?(1).+)?                                                # if we found the ID, everything can follow
 216                      $"""
 217     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 218     _formats = {
 219         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 220         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 221         '13': {'ext': '3gp'},
 222         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 223         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 224         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 225         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 226         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 227         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 228         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 229         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 230         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 231         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 232         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 233         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 234
 235
 236         # 3d videos
 237         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 238         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 239         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 240         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 241         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 242         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 243         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 244
 245         # Apple HTTP Live Streaming
 246         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 247         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 248         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 249         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 250         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 251         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 252         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 253
 254         # DASH mp4 video
 255         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 256         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 257         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 258         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 259         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 260         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 261         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 262         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 263         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 264         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 265         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 266
 267         # Dash mp4 audio
 268         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 269         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 270         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 271
 272         # Dash webm
 273         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 274         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 275         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 276         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 277         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 278         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 279         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
 280         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 281         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 282         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 283         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 284         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 285         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 286         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 287         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 288         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 289         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 290         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 291
 292         # Dash webm audio
 293         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 294         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 295
 296         # Dash webm audio with opus inside
 297         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 298         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 299         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 300
 301         # RTMP (unnamed)
 302         '_rtmp': {'protocol': 'rtmp'},
 303     }
 304
 305     IE_NAME = 'youtube'
 306     _TESTS = [
 307         {
 308             'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
 309             'info_dict': {
 310                 'id': 'BaW_jenozKc',
 311                 'ext': 'mp4',
 312                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 313                 'uploader': 'Philipp Hagemeister',
 314                 'uploader_id': 'phihag',
 315                 'upload_date': '20121002',
 316                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 317                 'categories': ['Science & Technology'],
 318                 'like_count': int,
 319                 'dislike_count': int,
 320             }
 321         },
 322         {
 323             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 324             'note': 'Test generic use_cipher_signature video (#897)',
 325             'info_dict': {
 326                 'id': 'UxxajLWwzqY',
 327                 'ext': 'mp4',
 328                 'upload_date': '20120506',
 329                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 330                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
 331                 'uploader': 'Icona Pop',
 332                 'uploader_id': 'IconaPop',
 333             }
 334         },
 335         {
 336             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 337             'note': 'Test VEVO video with age protection (#956)',
 338             'info_dict': {
 339                 'id': '07FYdnEawAQ',
 340                 'ext': 'mp4',
 341                 'upload_date': '20130703',
 342                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 343                 'description': 'md5:64249768eec3bc4276236606ea996373',
 344                 'uploader': 'justintimberlakeVEVO',
 345                 'uploader_id': 'justintimberlakeVEVO',
 346             }
 347         },
 348         {
 349             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 350             'note': 'Embed-only video (#1746)',
 351             'info_dict': {
 352                 'id': 'yZIXLfi8CZQ',
 353                 'ext': 'mp4',
 354                 'upload_date': '20120608',
 355                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 356                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 357                 'uploader': 'SET India',
 358                 'uploader_id': 'setindia'
 359             }
 360         },
 361         {
 362             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 363             'note': '256k DASH audio (format 141) via DASH manifest',
 364             'info_dict': {
 365                 'id': 'a9LDPn-MO4I',
 366                 'ext': 'm4a',
 367                 'upload_date': '20121002',
 368                 'uploader_id': '8KVIDEO',
 369                 'description': '',
 370                 'uploader': '8KVIDEO',
 371                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 372             },
 373             'params': {
 374                 'youtube_include_dash_manifest': True,
 375                 'format': '141',
 376             },
 377         },
 378         # DASH manifest with encrypted signature
 379         {
 380             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 381             'info_dict': {
 382                 'id': 'IB3lcPjvWLA',
 383                 'ext': 'm4a',
 384                 'title': 'Afrojack - The Spark ft. Spree Wilson',
 385                 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
 386                 'uploader': 'AfrojackVEVO',
 387                 'uploader_id': 'AfrojackVEVO',
 388                 'upload_date': '20131011',
 389             },
 390             'params': {
 391                 'youtube_include_dash_manifest': True,
 392                 'format': '141',
 393             },
 394         },
 395         # Controversy video
 396         {
 397             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
 398             'info_dict': {
 399                 'id': 'T4XJQO3qol8',
 400                 'ext': 'mp4',
 401                 'upload_date': '20100909',
 402                 'uploader': 'The Amazing Atheist',
 403                 'uploader_id': 'TheAmazingAtheist',
 404                 'title': 'Burning Everyone\'s Koran',
 405                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
 406             }
 407         }
 408     ]
 409
 410     def __init__(self, *args, **kwargs):
 411         super(YoutubeIE, self).__init__(*args, **kwargs)
 412         self._player_cache = {}
 413
 414     def report_video_info_webpage_download(self, video_id):
 415         """Report attempt to download video info webpage."""
 416         self.to_screen('%s: Downloading video info webpage' % video_id)
 417
 418     def report_information_extraction(self, video_id):
 419         """Report attempt to extract video information."""
 420         self.to_screen('%s: Extracting video information' % video_id)
 421
 422     def report_unavailable_format(self, video_id, format):
 423         """Report extracted video URL."""
 424         self.to_screen('%s: Format %s not available' % (video_id, format))
 425
 426     def report_rtmp_download(self):
 427         """Indicate the download will use the RTMP protocol."""
 428         self.to_screen('RTMP download detected')
 429
 430     def _signature_cache_id(self, example_sig):
 431         """ Return a string representation of a signature """
 432         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 433
 434     def _extract_signature_function(self, video_id, player_url, example_sig):
 435         id_m = re.match(
 436             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 437             player_url)
 438         if not id_m:
 439             raise ExtractorError('Cannot identify player %r' % player_url)
 440         player_type = id_m.group('ext')
 441         player_id = id_m.group('id')
 442
 443         # Read from filesystem cache
 444         func_id = '%s_%s_%s' % (
 445             player_type, player_id, self._signature_cache_id(example_sig))
 446         assert os.path.basename(func_id) == func_id
 447
 448         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 449         if cache_spec is not None:
 450             return lambda s: ''.join(s[i] for i in cache_spec)
 451
 452         if player_type == 'js':
 453             code = self._download_webpage(
 454                 player_url, video_id,
 455                 note='Downloading %s player %s' % (player_type, player_id),
 456                 errnote='Download of %s failed' % player_url)
 457             res = self._parse_sig_js(code)
 458         elif player_type == 'swf':
 459             urlh = self._request_webpage(
 460                 player_url, video_id,
 461                 note='Downloading %s player %s' % (player_type, player_id),
 462                 errnote='Download of %s failed' % player_url)
 463             code = urlh.read()
 464             res = self._parse_sig_swf(code)
 465         else:
 466             assert False, 'Invalid player type %r' % player_type
 467
 468         if cache_spec is None:
 469             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 470             cache_res = res(test_string)
 471             cache_spec = [ord(c) for c in cache_res]
 472
 473         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 474         return res
 475
 476     def _print_sig_code(self, func, example_sig):
 477         def gen_sig_code(idxs):
 478             def _genslice(start, end, step):
 479                 starts = '' if start == 0 else str(start)
 480                 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
 481                 steps = '' if step == 1 else (':%d' % step)
 482                 return 's[%s%s%s]' % (starts, ends, steps)
 483
 484             step = None
 485             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 486                                     # set as soon as step is set
 487             for i, prev in zip(idxs[1:], idxs[:-1]):
 488                 if step is not None:
 489                     if i - prev == step:
 490                         continue
 491                     yield _genslice(start, prev, step)
 492                     step = None
 493                     continue
 494                 if i - prev in [-1, 1]:
 495                     step = i - prev
 496                     start = prev
 497                     continue
 498                 else:
 499                     yield 's[%d]' % prev
 500             if step is None:
 501                 yield 's[%d]' % i
 502             else:
 503                 yield _genslice(start, i, step)
 504
 505         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 506         cache_res = func(test_string)
 507         cache_spec = [ord(c) for c in cache_res]
 508         expr_code = ' + '.join(gen_sig_code(cache_spec))
 509         signature_id_tuple = '(%s)' % (
 510             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 511         code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 512                 '    return %s\n') % (signature_id_tuple, expr_code)
 513         self.to_screen('Extracted signature function:\n' + code)
 514
 515     def _parse_sig_js(self, jscode):
 516         funcname = self._search_regex(
 517             r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
 518             'Initial JS player signature function name')
 519
 520         jsi = JSInterpreter(jscode)
 521         initial_function = jsi.extract_function(funcname)
 522         return lambda s: initial_function([s])
 523
 524     def _parse_sig_swf(self, file_contents):
 525         swfi = SWFInterpreter(file_contents)
 526         TARGET_CLASSNAME = 'SignatureDecipher'
 527         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 528         initial_function = swfi.extract_function(searched_class, 'decipher')
 529         return lambda s: initial_function([s])
 530
 531     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 532         """Turn the encrypted s field into a working signature"""
 533
 534         if player_url is None:
 535             raise ExtractorError('Cannot decrypt signature without player_url')
 536
 537         if player_url.startswith('//'):
 538             player_url = 'https:' + player_url
 539         try:
 540             player_id = (player_url, self._signature_cache_id(s))
 541             if player_id not in self._player_cache:
 542                 func = self._extract_signature_function(
 543                     video_id, player_url, s
 544                 )
 545                 self._player_cache[player_id] = func
 546             func = self._player_cache[player_id]
 547             if self._downloader.params.get('youtube_print_sig_code'):
 548                 self._print_sig_code(func, s)
 549             return func(s)
 550         except Exception as e:
 551             tb = traceback.format_exc()
 552             raise ExtractorError(
 553                 'Signature extraction failed: ' + tb, cause=e)
 554
 555     def _get_available_subtitles(self, video_id, webpage):
 556         try:
 557             sub_list = self._download_webpage(
 558                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 559                 video_id, note=False)
 560         except ExtractorError as err:
 561             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 562             return {}
 563         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 564
 565         sub_lang_list = {}
 566         for l in lang_list:
 567             lang = l[1]
 568             if lang in sub_lang_list:
 569                 continue
 570             params = compat_urllib_parse.urlencode({
 571                 'lang': lang,
 572                 'v': video_id,
 573                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 574                 'name': unescapeHTML(l[0]).encode('utf-8'),
 575             })
 576             url = 'https://www.youtube.com/api/timedtext?' + params
 577             sub_lang_list[lang] = url
 578         if not sub_lang_list:
 579             self._downloader.report_warning('video doesn\'t have subtitles')
 580             return {}
 581         return sub_lang_list
 582
 583     def _get_available_automatic_caption(self, video_id, webpage):
 584         """We need the webpage for getting the captions url, pass it as an
 585            argument to speed up the process."""
 586         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 587         self.to_screen('%s: Looking for automatic captions' % video_id)
 588         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 589         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 590         if mobj is None:
 591             self._downloader.report_warning(err_msg)
 592             return {}
 593         player_config = json.loads(mobj.group(1))
 594         try:
 595             args = player_config['args']
 596             caption_url = args['ttsurl']
 597             timestamp = args['timestamp']
 598             # We get the available subtitles
 599             list_params = compat_urllib_parse.urlencode({
 600                 'type': 'list',
 601                 'tlangs': 1,
 602                 'asrs': 1,
 603             })
 604             list_url = caption_url + '&' + list_params
 605             caption_list = self._download_xml(list_url, video_id)
 606             original_lang_node = caption_list.find('track')
 607             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
 608                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 609                 return {}
 610             original_lang = original_lang_node.attrib['lang_code']
 611
 612             sub_lang_list = {}
 613             for lang_node in caption_list.findall('target'):
 614                 sub_lang = lang_node.attrib['lang_code']
 615                 params = compat_urllib_parse.urlencode({
 616                     'lang': original_lang,
 617                     'tlang': sub_lang,
 618                     'fmt': sub_format,
 619                     'ts': timestamp,
 620                     'kind': 'asr',
 621                 })
 622                 sub_lang_list[sub_lang] = caption_url + '&' + params
 623             return sub_lang_list
 624         # An extractor error can be raise by the download process if there are
 625         # no automatic captions but there are subtitles
 626         except (KeyError, ExtractorError):
 627             self._downloader.report_warning(err_msg)
 628             return {}
 629
 630     @classmethod
 631     def extract_id(cls, url):
 632         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 633         if mobj is None:
 634             raise ExtractorError('Invalid URL: %s' % url)
 635         video_id = mobj.group(2)
 636         return video_id
 637
 638     def _extract_from_m3u8(self, manifest_url, video_id):
 639         url_map = {}
 640
 641         def _get_urls(_manifest):
 642             lines = _manifest.split('\n')
 643             urls = filter(lambda l: l and not l.startswith('#'),
 644                           lines)
 645             return urls
 646         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 647         formats_urls = _get_urls(manifest)
 648         for format_url in formats_urls:
 649             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 650             url_map[itag] = format_url
 651         return url_map
 652
 653     def _extract_annotations(self, video_id):
 654         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 655         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 656
 657     def _real_extract(self, url):
 658         proto = (
 659             'http' if self._downloader.params.get('prefer_insecure', False)
 660             else 'https')
 661
 662         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 663         mobj = re.search(self._NEXT_URL_RE, url)
 664         if mobj:
 665             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 666         video_id = self.extract_id(url)
 667
 668         # Get video webpage
 669         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
 670         pref_cookies = [
 671             c for c in self._downloader.cookiejar
 672             if c.domain == '.youtube.com' and c.name == 'PREF']
 673         for pc in pref_cookies:
 674             if 'hl=' in pc.value:
 675                 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
 676             else:
 677                 if pc.value:
 678                     pc.value += '&'
 679                 pc.value += 'hl=en'
 680         video_webpage = self._download_webpage(url, video_id)
 681
 682         # Attempt to extract SWF player URL
 683         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 684         if mobj is not None:
 685             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 686         else:
 687             player_url = None
 688
 689         # Get video info
 690         self.report_video_info_webpage_download(video_id)
 691         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 692             age_gate = True
 693             # We simulate the access to the video from www.youtube.com/v/{video_id}
 694             # this can be viewed without login into Youtube
 695             data = compat_urllib_parse.urlencode({
 696                 'video_id': video_id,
 697                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 698                 'sts': self._search_regex(
 699                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
 700             })
 701             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 702             video_info_webpage = self._download_webpage(
 703                 video_info_url, video_id,
 704                 note='Refetching age-gated info webpage',
 705                 errnote='unable to download video info webpage')
 706             video_info = compat_parse_qs(video_info_webpage)
 707         else:
 708             age_gate = False
 709             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 710                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 711                                   % (video_id, el_type))
 712                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 713                                                             note=False,
 714                                                             errnote='unable to download video info webpage')
 715                 video_info = compat_parse_qs(video_info_webpage)
 716                 if 'token' in video_info:
 717                     break
 718         if 'token' not in video_info:
 719             if 'reason' in video_info:
 720                 raise ExtractorError(
 721                     'YouTube said: %s' % video_info['reason'][0],
 722                     expected=True, video_id=video_id)
 723             else:
 724                 raise ExtractorError(
 725                     '"token" parameter not in video info for unknown reason',
 726                     video_id=video_id)
 727
 728         if 'view_count' in video_info:
 729             view_count = int(video_info['view_count'][0])
 730         else:
 731             view_count = None
 732
 733         # Check for "rental" videos
 734         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 735             raise ExtractorError('"rental" videos not supported')
 736
 737         # Start extracting information
 738         self.report_information_extraction(video_id)
 739
 740         # uploader
 741         if 'author' not in video_info:
 742             raise ExtractorError('Unable to extract uploader name')
 743         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 744
 745         # uploader_id
 746         video_uploader_id = None
 747         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 748         if mobj is not None:
 749             video_uploader_id = mobj.group(1)
 750         else:
 751             self._downloader.report_warning('unable to extract uploader nickname')
 752
 753         # title
 754         if 'title' in video_info:
 755             video_title = video_info['title'][0]
 756         else:
 757             self._downloader.report_warning('Unable to extract video title')
 758             video_title = '_'
 759
 760         # thumbnail image
 761         # We try first to get a high quality image:
 762         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 763                             video_webpage, re.DOTALL)
 764         if m_thumb is not None:
 765             video_thumbnail = m_thumb.group(1)
 766         elif 'thumbnail_url' not in video_info:
 767             self._downloader.report_warning('unable to extract video thumbnail')
 768             video_thumbnail = None
 769         else:   # don't panic if we can't find it
 770             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 771
 772         # upload date
 773         upload_date = None
 774         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 775         if mobj is None:
 776             mobj = re.search(
 777                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 778                 video_webpage)
 779         if mobj is not None:
 780             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 781             upload_date = unified_strdate(upload_date)
 782
 783         m_cat_container = self._search_regex(
 784             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 785             video_webpage, 'categories', fatal=False)
 786         if m_cat_container:
 787             category = self._html_search_regex(
 788                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 789                 default=None)
 790             video_categories = None if category is None else [category]
 791         else:
 792             video_categories = None
 793
 794         # description
 795         video_description = get_element_by_id("eow-description", video_webpage)
 796         if video_description:
 797             video_description = re.sub(r'''(?x)
 798                 <a\s+
 799                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 800                     title="([^"]+)"\s+
 801                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 802                     class="yt-uix-redirect-link"\s*>
 803                 [^<]+
 804                 </a>
 805             ''', r'\1', video_description)
 806             video_description = clean_html(video_description)
 807         else:
 808             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 809             if fd_mobj:
 810                 video_description = unescapeHTML(fd_mobj.group(1))
 811             else:
 812                 video_description = ''
 813
 814         def _extract_count(count_name):
 815             count = self._search_regex(
 816                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 817                 video_webpage, count_name, default=None)
 818             if count is not None:
 819                 return int(count.replace(',', ''))
 820             return None
 821         like_count = _extract_count('like')
 822         dislike_count = _extract_count('dislike')
 823
 824         # subtitles
 825         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 826
 827         if self._downloader.params.get('listsubtitles', False):
 828             self._list_available_subtitles(video_id, video_webpage)
 829             return
 830
 831         if 'length_seconds' not in video_info:
 832             self._downloader.report_warning('unable to extract video duration')
 833             video_duration = None
 834         else:
 835             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 836
 837         # annotations
 838         video_annotations = None
 839         if self._downloader.params.get('writeannotations', False):
 840             video_annotations = self._extract_annotations(video_id)
 841
 842         # Decide which formats to download
 843         try:
 844             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 845             if not mobj:
 846                 raise ValueError('Could not find vevo ID')
 847             json_code = uppercase_escape(mobj.group(1))
 848             ytplayer_config = json.loads(json_code)
 849             args = ytplayer_config['args']
 850             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 851             # this signatures are encrypted
 852             if 'url_encoded_fmt_stream_map' not in args:
 853                 raise ValueError('No stream_map present')  # caught below
 854             re_signature = re.compile(r'[&,]s=')
 855             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 856             if m_s is not None:
 857                 self.to_screen('%s: Encrypted signatures detected.' % video_id)
 858                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 859             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 860             if m_s is not None:
 861                 if 'adaptive_fmts' in video_info:
 862                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 863                 else:
 864                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 865         except ValueError:
 866             pass
 867
 868         def _map_to_format_list(urlmap):
 869             formats = []
 870             for itag, video_real_url in urlmap.items():
 871                 dct = {
 872                     'format_id': itag,
 873                     'url': video_real_url,
 874                     'player_url': player_url,
 875                 }
 876                 if itag in self._formats:
 877                     dct.update(self._formats[itag])
 878                 formats.append(dct)
 879             return formats
 880
 881         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 882             self.report_rtmp_download()
 883             formats = [{
 884                 'format_id': '_rtmp',
 885                 'protocol': 'rtmp',
 886                 'url': video_info['conn'][0],
 887                 'player_url': player_url,
 888             }]
 889         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 890             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
 891             if 'rtmpe%3Dyes' in encoded_url_map:
 892                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 893             url_map = {}
 894             for url_data_str in encoded_url_map.split(','):
 895                 url_data = compat_parse_qs(url_data_str)
 896                 if 'itag' not in url_data or 'url' not in url_data:
 897                     continue
 898                 format_id = url_data['itag'][0]
 899                 url = url_data['url'][0]
 900
 901                 if 'sig' in url_data:
 902                     url += '&signature=' + url_data['sig'][0]
 903                 elif 's' in url_data:
 904                     encrypted_sig = url_data['s'][0]
 905
 906                     if not age_gate:
 907                         jsplayer_url_json = self._search_regex(
 908                             r'"assets":.+?"js":\s*("[^"]+")',
 909                             video_webpage, 'JS player URL')
 910                         player_url = json.loads(jsplayer_url_json)
 911                     if player_url is None:
 912                         player_url_json = self._search_regex(
 913                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 914                             video_webpage, 'age gate player URL')
 915                         player_url = json.loads(player_url_json)
 916
 917                     if self._downloader.params.get('verbose'):
 918                         if player_url is None:
 919                             player_version = 'unknown'
 920                             player_desc = 'unknown'
 921                         else:
 922                             if player_url.endswith('swf'):
 923                                 player_version = self._search_regex(
 924                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 925                                     'flash player', fatal=False)
 926                                 player_desc = 'flash player %s' % player_version
 927                             else:
 928                                 player_version = self._search_regex(
 929                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 930                                     player_url,
 931                                     'html5 player', fatal=False)
 932                                 player_desc = 'html5 player %s' % player_version
 933
 934                         parts_sizes = self._signature_cache_id(encrypted_sig)
 935                         self.to_screen('{%s} signature length %s, %s' %
 936                                        (format_id, parts_sizes, player_desc))
 937
 938                     signature = self._decrypt_signature(
 939                         encrypted_sig, video_id, player_url, age_gate)
 940                     url += '&signature=' + signature
 941                 if 'ratebypass' not in url:
 942                     url += '&ratebypass=yes'
 943                 url_map[format_id] = url
 944             formats = _map_to_format_list(url_map)
 945         elif video_info.get('hlsvp'):
 946             manifest_url = video_info['hlsvp'][0]
 947             url_map = self._extract_from_m3u8(manifest_url, video_id)
 948             formats = _map_to_format_list(url_map)
 949         else:
 950             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 951
 952         # Look for the DASH manifest
 953         if self._downloader.params.get('youtube_include_dash_manifest', True):
 954             try:
 955                 # The DASH manifest used needs to be the one from the original video_webpage.
 956                 # The one found in get_video_info seems to be using different signatures.
 957                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 958                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 959                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 960                 if age_gate:
 961                     dash_manifest_url = video_info.get('dashmpd')[0]
 962                 else:
 963                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 964
 965                 def decrypt_sig(mobj):
 966                     s = mobj.group(1)
 967                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 968                     return '/signature/%s' % dec_s
 969                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 970                 dash_doc = self._download_xml(
 971                     dash_manifest_url, video_id,
 972                     note='Downloading DASH manifest',
 973                     errnote='Could not download DASH manifest')
 974                 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 975                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 976                     if url_el is None:
 977                         continue
 978                     format_id = r.attrib['id']
 979                     video_url = url_el.text
 980                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 981                     f = {
 982                         'format_id': format_id,
 983                         'url': video_url,
 984                         'width': int_or_none(r.attrib.get('width')),
 985                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 986                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 987                         'filesize': filesize,
 988                     }
 989                     try:
 990                         existing_format = next(
 991                             fo for fo in formats
 992                             if fo['format_id'] == format_id)
 993                     except StopIteration:
 994                         f.update(self._formats.get(format_id, {}))
 995                         formats.append(f)
 996                     else:
 997                         existing_format.update(f)
 998
 999             except (ExtractorError, KeyError) as e:
1000                 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1001
1002         self._sort_formats(formats)
1003
1004         return {
1005             'id': video_id,
1006             'uploader': video_uploader,
1007             'uploader_id': video_uploader_id,
1008             'upload_date': upload_date,
1009             'title': video_title,
1010             'thumbnail': video_thumbnail,
1011             'description': video_description,
1012             'categories': video_categories,
1013             'subtitles': video_subtitles,
1014             'duration': video_duration,
1015             'age_limit': 18 if age_gate else 0,
1016             'annotations': video_annotations,
1017             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1018             'view_count': view_count,
1019             'like_count': like_count,
1020             'dislike_count': dislike_count,
1021             'formats': formats,
1022         }
1023
1024
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes ('RD' playlist ids)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Turn an iterable of video ids into a list of 'Youtube' url results."""
        results = []
        for video_id in ids:
            results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix.

        Mixes are generated from a single video: the playlist id is just
        'RD' + video_id, so the watch page of that video is used as source.
        """
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Use the first candidate class that yields a non-empty element; when
        # none does, the result of the last lookup is kept.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        found_ids = re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage)
        entries = self._ids_to_results(orderedSet(found_ids))

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        When the URL also names a video ('v' query parameter) and the
        'noplaylist' option is set, a url result for that single video is
        returned instead.

        Raises ExtractorError for invalid URLs, 'TL' top lists and playlists
        that do not exist or are private.
        """
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query:
            video_id = query['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page
        # serves both as content and as the holder of the "load more" link
        content_html = page
        more_widget_html = page
        ids = []
        for page_num in itertools.count(1):
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1195
1196
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on how often the list page is downloaded while waiting for
    # the videos to show up; prevents an endless download loop.
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in ``url``.

        The channel page is searched for the link of the list whose title
        matches, then that page is downloaded (with a bounded number of
        retries) until it contains video ids.

        Raises ExtractorError when ``url`` is invalid or when no videos are
        found after ``_MAX_DOWNLOAD_ATTEMPTS`` downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after a fixed number of attempts
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos in top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1238
1239
class YoutubeChannelIE(InfoExtractor):
    """Collect the videos linked from a ``/channel/<id>`` page into a playlist."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the distinct ``/watch?v=`` ids in *page*, in order of first appearance."""
        found = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Return a playlist (id = channel id) with one url entry per video.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # Auto-generated channels list everything on this single page;
            # their ajax pages are empty and therefore useless.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based "load more" pages until the widget no
            # longer advertises another page.
            video_ids = []
            page_number = 1
            while True:
                ajax_page = self._download_json(
                    self._MORE_PAGES_URL % (page_number, channel_id),
                    channel_id, note='Downloading page #%s' % page_number,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(ajax_page['content_html']))

                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break
                page_number += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        return self.playlist_result(
            [self.url_result(video_id, 'Youtube', video_id=video_id)
             for video_id in video_ids],
            channel_id)
1299
1300
class YoutubeUserIE(InfoExtractor):
    """List a user's uploads through the paged GData ``uploads`` feed.

    Accepts ``youtube.com/user/<name>``-style URLs as well as the
    ``ytuser:<name>`` keyword (see ``_VALID_URL``).
    """

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class in this module claims *url*."""
        # Don't return True if the URL can be extracted with another youtube
        # extractor: this regex is too permissive and would match those too.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist titled with the username, backed by a paged list.

        Raises ExtractorError when *url* does not match ``_VALID_URL`` or
        when an API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per upload on page *pagenum*.

            Page 0 maps to ``start-index=1``. Nothing is yielded when the
            feed has no ``entry`` key.
            """
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # start-index is 1-based and inclusive, so a full page ends at
            # start_index + page size - 1 (1..50, 51..100, ...).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1374
1375
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor (``ytsearch`` key) that pages through ``_API_URL``."""

    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist with at most *n* results for *query*.

        Raises ExtractorError (expected) when a response has no ``items``.
        """
        PAGE_SIZE = 50

        found_ids = []
        wanted = n
        page_index = 0
        while PAGE_SIZE * page_index < wanted:
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_index + 1)
            raw_json = self._download_webpage(
                api_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            found_ids.extend(
                [video['id'] for video in api_response['items']])

            # Stop early when the API reports fewer results than requested.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed the total past n; keep the first n.
        entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                   for video_id in found_ids[:n]]
        return self.playlist_result(entries, query)
1417
1418
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant (``ytsearchdate`` key) whose API URL adds ``orderby=published``."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1424
1425
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a ``youtube.com/results?search_query=...`` page into a playlist."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist titled with the decoded query, one entry per result heading."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(heading_html):
            # The title is optional (fatal=False); the link is required.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)
        return {
            '_type': 'playlist',
            'entries': [build_entry(heading) for heading in headings],
            'title': query,
        }
1467
1468
class YoutubeShowIE(InfoExtractor):
    """Expose a ``/show/<id>`` page as a playlist of its season playlists."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the linked ``/playlist?list=`` URLs."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # Each season of the show is published as its own playlist.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1503
1504
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for extractors reading
    https://www.youtube.com/feed_ajax page by page.
    A subclass has to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, query action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    @property
    def _FEED_TEMPLATE(self):
        # The doubled %% leaves a single %s placeholder for the paging value.
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist of every video linked from the feed's pages."""
        entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, look for the next-page link in
            # the feed markup itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_number += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1550
1551
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``recommended`` feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1557
1558
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal ``watch_later`` feed."""

    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1565
1566
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal ``history`` feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: "\." is not a valid escape sequence in a normal string
    # literal, and every other _VALID_URL in this file is a raw string.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1573
1574
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Hand the favourites list over to the playlist extractor."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result for the playlist id found on ``/my_favorites``."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" value on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
1585
1586
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed, following "load more" links."""

    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with one entry per video id found."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # carries no index. The first page serves as both the content and
        # the "load more" widget markup.
        video_ids = []
        content_html = first_page
        more_widget_html = first_page

        page_num = 1
        while True:
            video_ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1623
1624
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack a video id and explain why they fail."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a hint about shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)