_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     OnDemandPagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36 class YoutubeBaseInfoExtractor(InfoExtractor):
  37     """Provide base functions for Youtube extractors"""
  38     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  39     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  40     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  41     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         return bool(self._download_webpage(
  48             self._LANG_URL, None,
  49             note='Setting language', errnote='unable to set language',
  50             fatal=False))
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note='Downloading login page',
  70             errnote='unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80                 'Email': username,
  81                 'GALX': galx,
  82                 'Passwd': password,
  83
  84                 'PersistentCookie': 'yes',
  85                 '_utf8': '霱',
  86                 'bgresponse': 'js_disabled',
  87                 'checkConnection': '',
  88                 'checkedDomains': 'youtube',
  89                 'dnConn': '',
  90                 'pstMsg': '0',
  91                 'rmShown': '1',
  92                 'secTok': '',
  93                 'signIn': 'Sign in',
  94                 'timeStmp': '',
  95                 'service': 'youtube',
  96                 'uilel': '3',
  97                 'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note='Logging in', errnote='unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning('unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning('unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _confirm_age(self):
 179         age_form = {
 180             'next_url': '/',
 181             'action_confirm': 'Confirm',
 182         }
 183         req = compat_urllib_request.Request(self._AGE_URL,
 184             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 185
 186         self._download_webpage(
 187             req, None,
 188             note='Confirming age', errnote='Unable to confirm age',
 189             fatal=False)
 190
 191     def _real_initialize(self):
 192         if self._downloader is None:
 193             return
 194         if self._get_login_info()[0] is not None:
 195             if not self._set_language():
 196                 return
 197         if not self._login():
 198             return
 199         self._confirm_age()
 200
 201
 202 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    # Description string for this extractor.
    IE_DESC = 'YouTube.com'
    # Verbose-mode regex for the URLs this extractor accepts. Group 1 is the
    # optional URL prefix (scheme, host and whatever precedes the ID); group 2
    # is the 11-character video ID (see extract_id).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the value of a next_url query parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static properties of the known format codes, keyed by format ID string.
    # Each value lists whatever is known up front for that format: container
    # extension, dimensions, codecs, audio bitrate ('abr'), a human-readable
    # 'format_note' and a relative 'preference'.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash mov
        '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mov', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
 315
    # Name of this extractor.
    IE_NAME = 'youtube'
    # Test cases for this extractor. Each entry gives a 'url' and the
    # 'info_dict' fields expected from it; 'note' describes the case and
    # 'params' supplies options used for that case.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
    ]
 407
 408     def __init__(self, *args, **kwargs):
 409         super(YoutubeIE, self).__init__(*args, **kwargs)
 410         self._player_cache = {}
 411
 412     def report_video_info_webpage_download(self, video_id):
 413         """Report attempt to download video info webpage."""
 414         self.to_screen('%s: Downloading video info webpage' % video_id)
 415
 416     def report_information_extraction(self, video_id):
 417         """Report attempt to extract video information."""
 418         self.to_screen('%s: Extracting video information' % video_id)
 419
 420     def report_unavailable_format(self, video_id, format):
 421         """Report extracted video URL."""
 422         self.to_screen('%s: Format %s not available' % (video_id, format))
 423
 424     def report_rtmp_download(self):
 425         """Indicate the download will use the RTMP protocol."""
 426         self.to_screen('RTMP download detected')
 427
 428     def _signature_cache_id(self, example_sig):
 429         """ Return a string representation of a signature """
 430         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 431
 432     def _extract_signature_function(self, video_id, player_url, example_sig):
 433         id_m = re.match(
 434             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 435             player_url)
 436         if not id_m:
 437             raise ExtractorError('Cannot identify player %r' % player_url)
 438         player_type = id_m.group('ext')
 439         player_id = id_m.group('id')
 440
 441         # Read from filesystem cache
 442         func_id = '%s_%s_%s' % (
 443             player_type, player_id, self._signature_cache_id(example_sig))
 444         assert os.path.basename(func_id) == func_id
 445
 446         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 447         if cache_spec is not None:
 448             return lambda s: ''.join(s[i] for i in cache_spec)
 449
 450         if player_type == 'js':
 451             code = self._download_webpage(
 452                 player_url, video_id,
 453                 note='Downloading %s player %s' % (player_type, player_id),
 454                 errnote='Download of %s failed' % player_url)
 455             res = self._parse_sig_js(code)
 456         elif player_type == 'swf':
 457             urlh = self._request_webpage(
 458                 player_url, video_id,
 459                 note='Downloading %s player %s' % (player_type, player_id),
 460                 errnote='Download of %s failed' % player_url)
 461             code = urlh.read()
 462             res = self._parse_sig_swf(code)
 463         else:
 464             assert False, 'Invalid player type %r' % player_type
 465
 466         if cache_spec is None:
 467             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 468             cache_res = res(test_string)
 469             cache_spec = [ord(c) for c in cache_res]
 470
 471         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 472         return res
 473
    def _print_sig_code(self, func, example_sig):
        """Print Python code equivalent to func for signatures shaped like example_sig.

        func is run on a test string of distinct characters to find which
        input index lands at each output position; those indices are then
        rendered as a sum of index/slice expressions on `s` and written with
        to_screen().
        """
        def gen_sig_code(idxs):
            # Yield 's[...]' expressions covering idxs, collapsing runs that
            # move by +1 or -1 into a single slice.
            # NOTE(review): if idxs has fewer than two elements the loop body
            # never runs and `i` is unbound at the final yield - confirm
            # callers never pass such a short signature.
            def _genslice(start, end, step):
                # Render the inclusive run start..end as a slice expression.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end+step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quell pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep going while the step stays the same,
                    # otherwise emit the finished run.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new run starts at prev.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever is still pending after the last pair.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Character k of the test string has code point k, so ord() of each
        # output character is the input index it came from.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
 512
 513     def _parse_sig_js(self, jscode):
 514         funcname = self._search_regex(
 515             r'signature=([$a-zA-Z]+)', jscode,
 516              'Initial JS player signature function name')
 517
 518         jsi = JSInterpreter(jscode)
 519         initial_function = jsi.extract_function(funcname)
 520         return lambda s: initial_function([s])
 521
 522     def _parse_sig_swf(self, file_contents):
 523         swfi = SWFInterpreter(file_contents)
 524         TARGET_CLASSNAME = 'SignatureDecipher'
 525         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 526         initial_function = swfi.extract_function(searched_class, 'decipher')
 527         return lambda s: initial_function([s])
 528
 529     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 530         """Turn the encrypted s field into a working signature"""
 531
 532         if player_url is None:
 533             raise ExtractorError('Cannot decrypt signature without player_url')
 534
 535         if player_url.startswith('//'):
 536             player_url = 'https:' + player_url
 537         try:
 538             player_id = (player_url, self._signature_cache_id(s))
 539             if player_id not in self._player_cache:
 540                 func = self._extract_signature_function(
 541                     video_id, player_url, s
 542                 )
 543                 self._player_cache[player_id] = func
 544             func = self._player_cache[player_id]
 545             if self._downloader.params.get('youtube_print_sig_code'):
 546                 self._print_sig_code(func, s)
 547             return func(s)
 548         except Exception as e:
 549             tb = traceback.format_exc()
 550             raise ExtractorError(
 551                 'Signature extraction failed: ' + tb, cause=e)
 552
 553     def _get_available_subtitles(self, video_id, webpage):
 554         try:
 555             sub_list = self._download_webpage(
 556                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 557                 video_id, note=False)
 558         except ExtractorError as err:
 559             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 560             return {}
 561         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 562
 563         sub_lang_list = {}
 564         for l in lang_list:
 565             lang = l[1]
 566             if lang in sub_lang_list:
 567                 continue
 568             params = compat_urllib_parse.urlencode({
 569                 'lang': lang,
 570                 'v': video_id,
 571                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 572                 'name': unescapeHTML(l[0]).encode('utf-8'),
 573             })
 574             url = 'https://www.youtube.com/api/timedtext?' + params
 575             sub_lang_list[lang] = url
 576         if not sub_lang_list:
 577             self._downloader.report_warning('video doesn\'t have subtitles')
 578             return {}
 579         return sub_lang_list
 580
 581     def _get_available_automatic_caption(self, video_id, webpage):
 582         """We need the webpage for getting the captions url, pass it as an
 583            argument to speed up the process."""
 584         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 585         self.to_screen('%s: Looking for automatic captions' % video_id)
 586         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 587         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 588         if mobj is None:
 589             self._downloader.report_warning(err_msg)
 590             return {}
 591         player_config = json.loads(mobj.group(1))
 592         try:
 593             args = player_config[u'args']
 594             caption_url = args[u'ttsurl']
 595             timestamp = args[u'timestamp']
 596             # We get the available subtitles
 597             list_params = compat_urllib_parse.urlencode({
 598                 'type': 'list',
 599                 'tlangs': 1,
 600                 'asrs': 1,
 601             })
 602             list_url = caption_url + '&' + list_params
 603             caption_list = self._download_xml(list_url, video_id)
 604             original_lang_node = caption_list.find('track')
 605             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 606                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 607                 return {}
 608             original_lang = original_lang_node.attrib['lang_code']
 609
 610             sub_lang_list = {}
 611             for lang_node in caption_list.findall('target'):
 612                 sub_lang = lang_node.attrib['lang_code']
 613                 params = compat_urllib_parse.urlencode({
 614                     'lang': original_lang,
 615                     'tlang': sub_lang,
 616                     'fmt': sub_format,
 617                     'ts': timestamp,
 618                     'kind': 'asr',
 619                 })
 620                 sub_lang_list[sub_lang] = caption_url + '&' + params
 621             return sub_lang_list
 622         # An extractor error can be raise by the download process if there are
 623         # no automatic captions but there are subtitles
 624         except (KeyError, ExtractorError):
 625             self._downloader.report_warning(err_msg)
 626             return {}
 627
 628     @classmethod
 629     def extract_id(cls, url):
 630         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 631         if mobj is None:
 632             raise ExtractorError('Invalid URL: %s' % url)
 633         video_id = mobj.group(2)
 634         return video_id
 635
 636     def _extract_from_m3u8(self, manifest_url, video_id):
 637         url_map = {}
 638         def _get_urls(_manifest):
 639             lines = _manifest.split('\n')
 640             urls = filter(lambda l: l and not l.startswith('#'),
 641                             lines)
 642             return urls
 643         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 644         formats_urls = _get_urls(manifest)
 645         for format_url in formats_urls:
 646             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 647             url_map[itag] = format_url
 648         return url_map
 649
 650     def _extract_annotations(self, video_id):
 651         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 652         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 653
 654     def _real_extract(self, url):
 655         proto = (
 656             'http' if self._downloader.params.get('prefer_insecure', False)
 657             else 'https')
 658
 659         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 660         mobj = re.search(self._NEXT_URL_RE, url)
 661         if mobj:
 662             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 663         video_id = self.extract_id(url)
 664
 665         # Get video webpage
 666         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 667         pref_cookies = [
 668             c for c in self._downloader.cookiejar
 669             if c.domain == '.youtube.com' and c.name == 'PREF']
 670         for pc in pref_cookies:
 671             if 'hl=' in pc.value:
 672                 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
 673             else:
 674                 if pc.value:
 675                     pc.value += '&'
 676                 pc.value += 'hl=en'
 677         video_webpage = self._download_webpage(url, video_id)
 678
 679         # Attempt to extract SWF player URL
 680         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 681         if mobj is not None:
 682             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 683         else:
 684             player_url = None
 685
 686         # Get video info
 687         self.report_video_info_webpage_download(video_id)
 688         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 689             self.report_age_confirmation()
 690             age_gate = True
 691             # We simulate the access to the video from www.youtube.com/v/{video_id}
 692             # this can be viewed without login into Youtube
 693             data = compat_urllib_parse.urlencode({
 694                 'video_id': video_id,
 695                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 696                 'sts': self._search_regex(
 697                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
 698             })
 699             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 700             video_info_webpage = self._download_webpage(video_info_url, video_id,
 701                                     note=False,
 702                                     errnote='unable to download video info webpage')
 703             video_info = compat_parse_qs(video_info_webpage)
 704         else:
 705             age_gate = False
 706             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 707                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 708                         % (video_id, el_type))
 709                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 710                                         note=False,
 711                                         errnote='unable to download video info webpage')
 712                 video_info = compat_parse_qs(video_info_webpage)
 713                 if 'token' in video_info:
 714                     break
 715         if 'token' not in video_info:
 716             if 'reason' in video_info:
 717                 raise ExtractorError(
 718                     'YouTube said: %s' % video_info['reason'][0],
 719                     expected=True, video_id=video_id)
 720             else:
 721                 raise ExtractorError(
 722                     '"token" parameter not in video info for unknown reason',
 723                     video_id=video_id)
 724
 725         if 'view_count' in video_info:
 726             view_count = int(video_info['view_count'][0])
 727         else:
 728             view_count = None
 729
 730         # Check for "rental" videos
 731         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 732             raise ExtractorError('"rental" videos not supported')
 733
 734         # Start extracting information
 735         self.report_information_extraction(video_id)
 736
 737         # uploader
 738         if 'author' not in video_info:
 739             raise ExtractorError('Unable to extract uploader name')
 740         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 741
 742         # uploader_id
 743         video_uploader_id = None
 744         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 745         if mobj is not None:
 746             video_uploader_id = mobj.group(1)
 747         else:
 748             self._downloader.report_warning('unable to extract uploader nickname')
 749
 750         # title
 751         if 'title' in video_info:
 752             video_title = video_info['title'][0]
 753         else:
 754             self._downloader.report_warning('Unable to extract video title')
 755             video_title = '_'
 756
 757         # thumbnail image
 758         # We try first to get a high quality image:
 759         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 760                             video_webpage, re.DOTALL)
 761         if m_thumb is not None:
 762             video_thumbnail = m_thumb.group(1)
 763         elif 'thumbnail_url' not in video_info:
 764             self._downloader.report_warning('unable to extract video thumbnail')
 765             video_thumbnail = None
 766         else:   # don't panic if we can't find it
 767             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 768
 769         # upload date
 770         upload_date = None
 771         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 772         if mobj is None:
 773             mobj = re.search(
 774                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 775                 video_webpage)
 776         if mobj is not None:
 777             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 778             upload_date = unified_strdate(upload_date)
 779
 780         m_cat_container = self._search_regex(
 781             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 782             video_webpage, 'categories', fatal=False)
 783         if m_cat_container:
 784             category = self._html_search_regex(
 785                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 786                 default=None)
 787             video_categories = None if category is None else [category]
 788         else:
 789             video_categories = None
 790
 791         # description
 792         video_description = get_element_by_id("eow-description", video_webpage)
 793         if video_description:
 794             video_description = re.sub(r'''(?x)
 795                 <a\s+
 796                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 797                     title="([^"]+)"\s+
 798                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 799                     class="yt-uix-redirect-link"\s*>
 800                 [^<]+
 801                 </a>
 802             ''', r'\1', video_description)
 803             video_description = clean_html(video_description)
 804         else:
 805             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 806             if fd_mobj:
 807                 video_description = unescapeHTML(fd_mobj.group(1))
 808             else:
 809                 video_description = ''
 810
 811         def _extract_count(count_name):
 812             count = self._search_regex(
 813                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 814                 video_webpage, count_name, default=None)
 815             if count is not None:
 816                 return int(count.replace(',', ''))
 817             return None
 818         like_count = _extract_count('like')
 819         dislike_count = _extract_count('dislike')
 820
 821         # subtitles
 822         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 823
 824         if self._downloader.params.get('listsubtitles', False):
 825             self._list_available_subtitles(video_id, video_webpage)
 826             return
 827
 828         if 'length_seconds' not in video_info:
 829             self._downloader.report_warning('unable to extract video duration')
 830             video_duration = None
 831         else:
 832             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 833
 834         # annotations
 835         video_annotations = None
 836         if self._downloader.params.get('writeannotations', False):
 837                 video_annotations = self._extract_annotations(video_id)
 838
 839         # Decide which formats to download
 840         try:
 841             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 842             if not mobj:
 843                 raise ValueError('Could not find vevo ID')
 844             json_code = uppercase_escape(mobj.group(1))
 845             ytplayer_config = json.loads(json_code)
 846             args = ytplayer_config['args']
 847             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 848             # this signatures are encrypted
 849             if 'url_encoded_fmt_stream_map' not in args:
 850                 raise ValueError('No stream_map present')  # caught below
 851             re_signature = re.compile(r'[&,]s=')
 852             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 853             if m_s is not None:
 854                 self.to_screen('%s: Encrypted signatures detected.' % video_id)
 855                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 856             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 857             if m_s is not None:
 858                 if 'adaptive_fmts' in video_info:
 859                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 860                 else:
 861                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 862         except ValueError:
 863             pass
 864
 865         def _map_to_format_list(urlmap):
 866             formats = []
 867             for itag, video_real_url in urlmap.items():
 868                 dct = {
 869                     'format_id': itag,
 870                     'url': video_real_url,
 871                     'player_url': player_url,
 872                 }
 873                 if itag in self._formats:
 874                     dct.update(self._formats[itag])
 875                 formats.append(dct)
 876             return formats
 877
 878         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 879             self.report_rtmp_download()
 880             formats = [{
 881                 'format_id': '_rtmp',
 882                 'protocol': 'rtmp',
 883                 'url': video_info['conn'][0],
 884                 'player_url': player_url,
 885             }]
 886         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 887             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
 888             if 'rtmpe%3Dyes' in encoded_url_map:
 889                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 890             url_map = {}
 891             for url_data_str in encoded_url_map.split(','):
 892                 url_data = compat_parse_qs(url_data_str)
 893                 if 'itag' not in url_data or 'url' not in url_data:
 894                     continue
 895                 format_id = url_data['itag'][0]
 896                 url = url_data['url'][0]
 897
 898                 if 'sig' in url_data:
 899                     url += '&signature=' + url_data['sig'][0]
 900                 elif 's' in url_data:
 901                     encrypted_sig = url_data['s'][0]
 902
 903                     if not age_gate:
 904                         jsplayer_url_json = self._search_regex(
 905                             r'"assets":.+?"js":\s*("[^"]+")',
 906                             video_webpage, 'JS player URL')
 907                         player_url = json.loads(jsplayer_url_json)
 908                     if player_url is None:
 909                         player_url_json = self._search_regex(
 910                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 911                             video_webpage, 'age gate player URL')
 912                         player_url = json.loads(player_url_json)
 913
 914                     if self._downloader.params.get('verbose'):
 915                         if player_url is None:
 916                             player_version = 'unknown'
 917                             player_desc = 'unknown'
 918                         else:
 919                             if player_url.endswith('swf'):
 920                                 player_version = self._search_regex(
 921                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 922                                     'flash player', fatal=False)
 923                                 player_desc = 'flash player %s' % player_version
 924                             else:
 925                                 player_version = self._search_regex(
 926                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 927                                     player_url,
 928                                     'html5 player', fatal=False)
 929                                 player_desc = 'html5 player %s' % player_version
 930
 931                         parts_sizes = self._signature_cache_id(encrypted_sig)
 932                         self.to_screen('{%s} signature length %s, %s' %
 933                             (format_id, parts_sizes, player_desc))
 934
 935                     signature = self._decrypt_signature(
 936                         encrypted_sig, video_id, player_url, age_gate)
 937                     url += '&signature=' + signature
 938                 if 'ratebypass' not in url:
 939                     url += '&ratebypass=yes'
 940                 url_map[format_id] = url
 941             formats = _map_to_format_list(url_map)
 942         elif video_info.get('hlsvp'):
 943             manifest_url = video_info['hlsvp'][0]
 944             url_map = self._extract_from_m3u8(manifest_url, video_id)
 945             formats = _map_to_format_list(url_map)
 946         else:
 947             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 948
 949         # Look for the DASH manifest
 950         if self._downloader.params.get('youtube_include_dash_manifest', True):
 951             try:
 952                 # The DASH manifest used needs to be the one from the original video_webpage.
 953                 # The one found in get_video_info seems to be using different signatures.
 954                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 955                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 956                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 957                 if age_gate:
 958                     dash_manifest_url = video_info.get('dashmpd')[0]
 959                 else:
 960                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 961                 def decrypt_sig(mobj):
 962                     s = mobj.group(1)
 963                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 964                     return '/signature/%s' % dec_s
 965                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 966                 dash_doc = self._download_xml(
 967                     dash_manifest_url, video_id,
 968                     note='Downloading DASH manifest',
 969                     errnote='Could not download DASH manifest')
 970                 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 971                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 972                     if url_el is None:
 973                         continue
 974                     format_id = r.attrib['id']
 975                     video_url = url_el.text
 976                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 977                     f = {
 978                         'format_id': format_id,
 979                         'url': video_url,
 980                         'width': int_or_none(r.attrib.get('width')),
 981                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 982                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 983                         'filesize': filesize,
 984                     }
 985                     try:
 986                         existing_format = next(
 987                             fo for fo in formats
 988                             if fo['format_id'] == format_id)
 989                     except StopIteration:
 990                         f.update(self._formats.get(format_id, {}))
 991                         formats.append(f)
 992                     else:
 993                         existing_format.update(f)
 994
 995             except (ExtractorError, KeyError) as e:
 996                 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
 997
 998         self._sort_formats(formats)
 999
1000         return {
1001             'id':           video_id,
1002             'uploader':     video_uploader,
1003             'uploader_id':  video_uploader_id,
1004             'upload_date':  upload_date,
1005             'title':        video_title,
1006             'thumbnail':    video_thumbnail,
1007             'description':  video_description,
1008             'categories':   video_categories,
1009             'subtitles':    video_subtitles,
1010             'duration':     video_duration,
1011             'age_limit':    18 if age_gate else 0,
1012             'annotations':  video_annotations,
1013             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1014             'view_count':   view_count,
1015             'like_count': like_count,
1016             'dislike_count': dislike_count,
1017             'formats':      formats,
1018         }
1019
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, given as a playlist URL or a bare id."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Turn video ids into url_result entries for the 'Youtube' extractor."""
        results = []
        for video_id in ids:
            results.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix, whose id is 'RD' followed by the seed video id."""
        # A mix is generated from a single video, which is why the last
        # 11 characters of the playlist id are used as the video id
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # Take the first title element that yields something
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_entry_re = (
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id))
        ids = orderedSet(re.findall(mix_entry_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for *url*.

        A single video result is returned instead when the URL also names a
        video and the 'noplaylist' option is set.
        """
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page
        # serves both as content and as the holder of the "load more" link
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                entry.group('id')
                for entry in re.finditer(self._VIDEO_RE, content_html)
                if entry.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_match is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1189
1190
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for top lists, addressed as 'yttoplist:{channel}:{list title}'."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Maximum number of downloads of the list page before giving up, so that
    # a page that never contains videos cannot stall the extraction forever
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result of the top list named by *url*.

        The list link is looked up on the channel page by its title, then
        the list page is downloaded until it contains video ids.

        Raises ExtractorError when no video is found after
        _MAX_DOWNLOAD_ATTEMPTS downloads of the list page.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, up to a fixed number of attempts
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # The loop ran out of attempts without finding any video
            raise ExtractorError(
                'Unable to find any video in top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1232
1233
class YoutubeChannelIE(InfoExtractor):
    """Playlist extractor for the videos listed on a YouTube channel."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Paging continues for as long as this marker shows up in the
    # "load more" widget of an ajax response.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Formatted with (page number, channel id).
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``.

        Ids are listed in order of first appearance, without duplicates.
        """
        return orderedSet(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        """Collect the channel's video ids and return them as a playlist."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)

        if re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None:
            # Autogenerated channel: everything is on this single page,
            # the ajax pages are empty and therefore useless here.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Regular channel: walk the json-based ajax pages one by one.
            video_ids = []
            for page_number in itertools.count(1):
                ajax_page = self._download_json(
                    self._MORE_PAGES_URL % (page_number, channel_id),
                    channel_id,
                    note='Downloading page #%s' % page_number,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(ajax_page['content_html']))

                widget_html = ajax_page['load_more_widget_html']
                if self._MORE_PAGES_INDICATOR not in widget_html:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1293
1294
class YoutubeUserIE(InfoExtractor):
    """Playlist extractor for a user's uploads, read from the GData uploads feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # Number of entries requested per feed query.
    _GDATA_PAGE_SIZE = 50
    # Formatted with (username, max-results, start-index).
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only when no other extractor of this module accepts ``url``.

        The regex of this extractor is too permissive and would also match
        URLs that belong to the other youtube extractors, so every other
        module-level name ending in ``IE`` is asked first.
        """
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when a feed response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The feed's start-index is 1-based; pagenum is presumably
            # 0-based (supplied by OnDemandPagedList) - TODO confirm.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # A query for N results starting at start_index covers
            # start_index .. start_index + N - 1 (inclusive).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries yields nothing for this page.
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1366
1367
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData video search feed."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    # Formatted with (url-quoted query, 1-based start index).
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist holding at most ``n`` results for ``query``.

        Raises ExtractorError when an API response carries no 'items'.
        """
        PAGE_SIZE = 50
        found_ids = []
        wanted = n
        page_index = 0

        while page_index * PAGE_SIZE < wanted:
            first_result = page_index * PAGE_SIZE + 1
            quoted_query = compat_urllib_parse.quote_plus(
                query.encode('utf-8'))
            raw_json = self._download_webpage(
                self._API_URL % (quoted_query, first_result),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            found_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never ask for more than the API says is available.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed us past n; trim the surplus.
        videos = []
        for video_id in found_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1409
1410
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds ``orderby=published``."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1416
1417
class YoutubeSearchURLIE(InfoExtractor):
    """Playlist extractor for the results shown on a YouTube search results URL."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page into a playlist dict."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Only the first item section of the page is inspected.
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is described by one "yt-lockup-title" heading.
        for heading in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional (fatal=False), the link is not.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1459
1460
class YoutubeShowIE(InfoExtractor):
    """Playlist extractor for show pages; each season is a playlist link."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1495
1496
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for extractors reading their videos from
    https://www.youtube.com/feed_ajax
    Subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # Set to True to query action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining ``%s`` takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all found videos as a playlist."""
        feed_entries = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget the feed markup itself is searched
            # for the "load more" link.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No "load more" link: this was the last page.
                break
            paging = next_page.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1542
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1548
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1554     _PERSONAL_FEED = True
1555
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like every other _VALID_URL in this file: "\." in a plain
    # string literal is an invalid escape sequence (the pattern is unchanged).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1562
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its underlying playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Delegate to YoutubePlaylist with the list id found on the page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1573
1574
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Playlist extractor for the videos of the subscriptions feed."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Follow the feed's "load more" links and return all video ids found."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same approach as for playlists, except that the video id regex
        # carries no index. The first page serves both as content and as
        # the place to look for the "load more" link.
        content_html = first_page
        more_widget_html = first_page
        ids = []
        page_num = 1

        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1611
1612
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that lost their video id and explain how to quote them."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)