_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import time
  11 import traceback
  12
  13 from .common import InfoExtractor, SearchInfoExtractor
  14 from .subtitles import SubtitlesInfoExtractor
  15 from ..jsinterp import JSInterpreter
  16 from ..swfinterp import SWFInterpreter
  17 from ..utils import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24
  25     clean_html,
  26     get_element_by_id,
  27     get_element_by_attribute,
  28     ExtractorError,
  29     int_or_none,
  30     OnDemandPagedList,
  31     unescapeHTML,
  32     unified_strdate,
  33     orderedSet,
  34     uppercase_escape,
  35 )
  36
  37
  38 class YoutubeBaseInfoExtractor(InfoExtractor):
  39     """Provide base functions for Youtube extractors"""
  40     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  41     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         self._set_cookie(
  48             '.youtube.com', 'PREF', 'f1=50000000&hl=en',
  49             # YouTube sets the expire time to about two months
  50             expire_time=time.time() + 2 * 30 * 24 * 3600)
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note='Downloading login page',
  70             errnote='unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79             'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80             'Email': username,
  81             'GALX': galx,
  82             'Passwd': password,
  83
  84             'PersistentCookie': 'yes',
  85             '_utf8': '霱',
  86             'bgresponse': 'js_disabled',
  87             'checkConnection': '',
  88             'checkedDomains': 'youtube',
  89             'dnConn': '',
  90             'pstMsg': '0',
  91             'rmShown': '1',
  92             'secTok': '',
  93             'signIn': 'Sign in',
  94             'timeStmp': '',
  95             'service': 'youtube',
  96             'uilel': '3',
  97             'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note='Logging in', errnote='unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning('unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning('unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _real_initialize(self):
 179         if self._downloader is None:
 180             return
 181         self._set_language()
 182         if not self._login():
 183             return
 184
 185
 186 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 187     IE_DESC = 'YouTube.com'
 188     _VALID_URL = r"""(?x)^
 189                      (
 190                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 191                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 192                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 193                             (?:www\.)?pwnyoutube\.com/|
 194                             (?:www\.)?yourepeat\.com/|
 195                             tube\.majestyc\.net/|
 196                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 197                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 198                          (?:                                                  # the various things that can precede the ID:
 199                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 200                              |(?:                                             # or the v= param in all its forms
 201                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 202                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 203                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 204                                  v=
 205                              )
 206                          ))
 207                          |youtu\.be/                                          # just youtu.be/xxxx
 208                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 209                          )
 210                      )?                                                       # all until now is optional -> you can pass the naked ID
 211                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 212                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 213                      (?(1).+)?                                                # if we found the ID, everything can follow
 214                      $"""
 215     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 216     _formats = {
 217         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 218         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 219         '13': {'ext': '3gp'},
 220         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 221         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 222         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 223         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 224         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 225         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 226         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 227         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 228         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 229         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 230         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 231         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 232
 233
 234         # 3d videos
 235         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 236         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 237         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 238         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 239         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 240         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 241         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 242
 243         # Apple HTTP Live Streaming
 244         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 245         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 246         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 247         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 248         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 249         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 250         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 251
 252         # DASH mp4 video
 253         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 254         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 255         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 256         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 257         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 258         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 259         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 260         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 261         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 262         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 263         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 264
 265         # Dash mp4 audio
 266         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 267         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 268         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 269
 270         # Dash webm
 271         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 272         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 273         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 274         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 275         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 276         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 277         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
 278         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 279         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 280         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 281         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 282         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 283         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 284         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 285         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 286         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 287         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 288         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 289         '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
 290
 291         # Dash webm audio
 292         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 293         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 294
 295         # Dash webm audio with opus inside
 296         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 297         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 298         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 299
 300         # RTMP (unnamed)
 301         '_rtmp': {'protocol': 'rtmp'},
 302     }
 303
 304     IE_NAME = 'youtube'
 305     _TESTS = [
 306         {
 307             'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
 308             'info_dict': {
 309                 'id': 'BaW_jenozKc',
 310                 'ext': 'mp4',
 311                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 312                 'uploader': 'Philipp Hagemeister',
 313                 'uploader_id': 'phihag',
 314                 'upload_date': '20121002',
 315                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 316                 'categories': ['Science & Technology'],
 317                 'like_count': int,
 318                 'dislike_count': int,
 319             }
 320         },
 321         {
 322             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 323             'note': 'Test generic use_cipher_signature video (#897)',
 324             'info_dict': {
 325                 'id': 'UxxajLWwzqY',
 326                 'ext': 'mp4',
 327                 'upload_date': '20120506',
 328                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 329                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
 330                 'uploader': 'Icona Pop',
 331                 'uploader_id': 'IconaPop',
 332             }
 333         },
 334         {
 335             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 336             'note': 'Test VEVO video with age protection (#956)',
 337             'info_dict': {
 338                 'id': '07FYdnEawAQ',
 339                 'ext': 'mp4',
 340                 'upload_date': '20130703',
 341                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 342                 'description': 'md5:64249768eec3bc4276236606ea996373',
 343                 'uploader': 'justintimberlakeVEVO',
 344                 'uploader_id': 'justintimberlakeVEVO',
 345             }
 346         },
 347         {
 348             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 349             'note': 'Embed-only video (#1746)',
 350             'info_dict': {
 351                 'id': 'yZIXLfi8CZQ',
 352                 'ext': 'mp4',
 353                 'upload_date': '20120608',
 354                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 355                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 356                 'uploader': 'SET India',
 357                 'uploader_id': 'setindia'
 358             }
 359         },
 360         {
 361             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 362             'note': '256k DASH audio (format 141) via DASH manifest',
 363             'info_dict': {
 364                 'id': 'a9LDPn-MO4I',
 365                 'ext': 'm4a',
 366                 'upload_date': '20121002',
 367                 'uploader_id': '8KVIDEO',
 368                 'description': '',
 369                 'uploader': '8KVIDEO',
 370                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 371             },
 372             'params': {
 373                 'youtube_include_dash_manifest': True,
 374                 'format': '141',
 375             },
 376         },
 377         # DASH manifest with encrypted signature
 378         {
 379             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 380             'info_dict': {
 381                 'id': 'IB3lcPjvWLA',
 382                 'ext': 'm4a',
 383                 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
 384                 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
 385                 'uploader': 'AfrojackVEVO',
 386                 'uploader_id': 'AfrojackVEVO',
 387                 'upload_date': '20131011',
 388             },
 389             'params': {
 390                 'youtube_include_dash_manifest': True,
 391                 'format': '141',
 392             },
 393         },
 394         # Controversy video
 395         {
 396             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
 397             'info_dict': {
 398                 'id': 'T4XJQO3qol8',
 399                 'ext': 'mp4',
 400                 'upload_date': '20100909',
 401                 'uploader': 'The Amazing Atheist',
 402                 'uploader_id': 'TheAmazingAtheist',
 403                 'title': 'Burning Everyone\'s Koran',
 404                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
 405             }
 406         },
 407         # Normal age-gate video (No vevo, embed allowed)
 408         {
 409             'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
 410             'info_dict': {
 411                 'id': 'HtVdAasjOgU',
 412                 'ext': 'mp4',
 413                 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
 414                 'description': 'md5:eca57043abae25130f58f655ad9a7771',
 415                 'uploader': 'The Witcher',
 416                 'uploader_id': 'WitcherGame',
 417                 'upload_date': '20140605',
 418             },
 419         },
 420         # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
 421         {
 422             'url': '__2ABJjxzNo',
 423             'info_dict': {
 424                 'id': '__2ABJjxzNo',
 425                 'ext': 'mp4',
 426                 'upload_date': '20100430',
 427                 'uploader_id': 'deadmau5',
 428                 'description': 'md5:12c56784b8032162bb936a5f76d55360',
 429                 'uploader': 'deadmau5',
 430                 'title': 'Deadmau5 - Some Chords (HD)',
 431             },
 432             'expected_warnings': [
 433                 'DASH manifest missing',
 434             ]
 435         }
 436     ]
 437
 438     def __init__(self, *args, **kwargs):
 439         super(YoutubeIE, self).__init__(*args, **kwargs)
 440         self._player_cache = {}
 441
 442     def report_video_info_webpage_download(self, video_id):
 443         """Report attempt to download video info webpage."""
 444         self.to_screen('%s: Downloading video info webpage' % video_id)
 445
 446     def report_information_extraction(self, video_id):
 447         """Report attempt to extract video information."""
 448         self.to_screen('%s: Extracting video information' % video_id)
 449
 450     def report_unavailable_format(self, video_id, format):
 451         """Report extracted video URL."""
 452         self.to_screen('%s: Format %s not available' % (video_id, format))
 453
 454     def report_rtmp_download(self):
 455         """Indicate the download will use the RTMP protocol."""
 456         self.to_screen('RTMP download detected')
 457
 458     def _signature_cache_id(self, example_sig):
 459         """ Return a string representation of a signature """
 460         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 461
 462     def _extract_signature_function(self, video_id, player_url, example_sig):
 463         id_m = re.match(
 464             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 465             player_url)
 466         if not id_m:
 467             raise ExtractorError('Cannot identify player %r' % player_url)
 468         player_type = id_m.group('ext')
 469         player_id = id_m.group('id')
 470
 471         # Read from filesystem cache
 472         func_id = '%s_%s_%s' % (
 473             player_type, player_id, self._signature_cache_id(example_sig))
 474         assert os.path.basename(func_id) == func_id
 475
 476         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 477         if cache_spec is not None:
 478             return lambda s: ''.join(s[i] for i in cache_spec)
 479
 480         if player_type == 'js':
 481             code = self._download_webpage(
 482                 player_url, video_id,
 483                 note='Downloading %s player %s' % (player_type, player_id),
 484                 errnote='Download of %s failed' % player_url)
 485             res = self._parse_sig_js(code)
 486         elif player_type == 'swf':
 487             urlh = self._request_webpage(
 488                 player_url, video_id,
 489                 note='Downloading %s player %s' % (player_type, player_id),
 490                 errnote='Download of %s failed' % player_url)
 491             code = urlh.read()
 492             res = self._parse_sig_swf(code)
 493         else:
 494             assert False, 'Invalid player type %r' % player_type
 495
 496         if cache_spec is None:
 497             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 498             cache_res = res(test_string)
 499             cache_spec = [ord(c) for c in cache_res]
 500
 501         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 502         return res
 503
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function func.

        func is run on a probe string as long as example_sig; the result is
        turned into a list of source indices, compressed into index/slice
        expressions on ``s``, and written out through self.to_screen.
        """
        def gen_sig_code(idxs):
            """Yield 's[...]' expression strings that together reproduce idxs."""
            def _genslice(start, end, step):
                """Format a slice expression covering start..end inclusive."""
                starts = '' if start == 0 else str(start)
                # The stop bound is exclusive, hence end + step. A negative
                # stop would count from the end of the string, so the bound
                # is left open in that case.
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            # step is None while no run of consecutive (+1 / -1) indices is open.
            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk adjacent pairs: prev is the index just before i.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        # Still inside the current run.
                        continue
                    # Run ended at prev: emit it and start afresh.
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new run begins at prev.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush whatever is pending for the final index.
            # NOTE(review): if idxs has fewer than two elements the loop body
            # never runs and ``i`` is unbound here - confirm callers never
            # pass such a short signature.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe string: the character at position k has code point k, so the
        # code points of func's output are the source indices it picks.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        # Tuple of the lengths of the dot-separated parts of example_sig.
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
 542
 543     def _parse_sig_js(self, jscode):
 544         funcname = self._search_regex(
 545             r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
 546             'Initial JS player signature function name')
 547
 548         jsi = JSInterpreter(jscode)
 549         initial_function = jsi.extract_function(funcname)
 550         return lambda s: initial_function([s])
 551
 552     def _parse_sig_swf(self, file_contents):
 553         swfi = SWFInterpreter(file_contents)
 554         TARGET_CLASSNAME = 'SignatureDecipher'
 555         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 556         initial_function = swfi.extract_function(searched_class, 'decipher')
 557         return lambda s: initial_function([s])
 558
 559     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 560         """Turn the encrypted s field into a working signature"""
 561
 562         if player_url is None:
 563             raise ExtractorError('Cannot decrypt signature without player_url')
 564
 565         if player_url.startswith('//'):
 566             player_url = 'https:' + player_url
 567         try:
 568             player_id = (player_url, self._signature_cache_id(s))
 569             if player_id not in self._player_cache:
 570                 func = self._extract_signature_function(
 571                     video_id, player_url, s
 572                 )
 573                 self._player_cache[player_id] = func
 574             func = self._player_cache[player_id]
 575             if self._downloader.params.get('youtube_print_sig_code'):
 576                 self._print_sig_code(func, s)
 577             return func(s)
 578         except Exception as e:
 579             tb = traceback.format_exc()
 580             raise ExtractorError(
 581                 'Signature extraction failed: ' + tb, cause=e)
 582
 583     def _get_available_subtitles(self, video_id, webpage):
 584         try:
 585             sub_list = self._download_webpage(
 586                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 587                 video_id, note=False)
 588         except ExtractorError as err:
 589             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 590             return {}
 591         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 592
 593         sub_lang_list = {}
 594         for l in lang_list:
 595             lang = l[1]
 596             if lang in sub_lang_list:
 597                 continue
 598             params = compat_urllib_parse.urlencode({
 599                 'lang': lang,
 600                 'v': video_id,
 601                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 602                 'name': unescapeHTML(l[0]).encode('utf-8'),
 603             })
 604             url = 'https://www.youtube.com/api/timedtext?' + params
 605             sub_lang_list[lang] = url
 606         if not sub_lang_list:
 607             self._downloader.report_warning('video doesn\'t have subtitles')
 608             return {}
 609         return sub_lang_list
 610
 611     def _get_available_automatic_caption(self, video_id, webpage):
 612         """We need the webpage for getting the captions url, pass it as an
 613            argument to speed up the process."""
 614         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 615         self.to_screen('%s: Looking for automatic captions' % video_id)
 616         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 617         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 618         if mobj is None:
 619             self._downloader.report_warning(err_msg)
 620             return {}
 621         player_config = json.loads(mobj.group(1))
 622         try:
 623             args = player_config['args']
 624             caption_url = args['ttsurl']
 625             timestamp = args['timestamp']
 626             # We get the available subtitles
 627             list_params = compat_urllib_parse.urlencode({
 628                 'type': 'list',
 629                 'tlangs': 1,
 630                 'asrs': 1,
 631             })
 632             list_url = caption_url + '&' + list_params
 633             caption_list = self._download_xml(list_url, video_id)
 634             original_lang_node = caption_list.find('track')
 635             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
 636                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 637                 return {}
 638             original_lang = original_lang_node.attrib['lang_code']
 639
 640             sub_lang_list = {}
 641             for lang_node in caption_list.findall('target'):
 642                 sub_lang = lang_node.attrib['lang_code']
 643                 params = compat_urllib_parse.urlencode({
 644                     'lang': original_lang,
 645                     'tlang': sub_lang,
 646                     'fmt': sub_format,
 647                     'ts': timestamp,
 648                     'kind': 'asr',
 649                 })
 650                 sub_lang_list[sub_lang] = caption_url + '&' + params
 651             return sub_lang_list
 652         # An extractor error can be raise by the download process if there are
 653         # no automatic captions but there are subtitles
 654         except (KeyError, ExtractorError):
 655             self._downloader.report_warning(err_msg)
 656             return {}
 657
 658     @classmethod
 659     def extract_id(cls, url):
 660         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 661         if mobj is None:
 662             raise ExtractorError('Invalid URL: %s' % url)
 663         video_id = mobj.group(2)
 664         return video_id
 665
 666     def _extract_from_m3u8(self, manifest_url, video_id):
 667         url_map = {}
 668
 669         def _get_urls(_manifest):
 670             lines = _manifest.split('\n')
 671             urls = filter(lambda l: l and not l.startswith('#'),
 672                           lines)
 673             return urls
 674         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 675         formats_urls = _get_urls(manifest)
 676         for format_url in formats_urls:
 677             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 678             url_map[itag] = format_url
 679         return url_map
 680
 681     def _extract_annotations(self, video_id):
 682         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 683         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 684
 685     def _parse_dash_manifest(
 686             self, video_id, dash_manifest_url, player_url, age_gate):
 687         def decrypt_sig(mobj):
 688             s = mobj.group(1)
 689             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 690             return '/signature/%s' % dec_s
 691         dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 692         dash_doc = self._download_xml(
 693             dash_manifest_url, video_id,
 694             note='Downloading DASH manifest',
 695             errnote='Could not download DASH manifest')
 696
 697         formats = []
 698         for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 699             url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 700             if url_el is None:
 701                 continue
 702             format_id = r.attrib['id']
 703             video_url = url_el.text
 704             filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 705             f = {
 706                 'format_id': format_id,
 707                 'url': video_url,
 708                 'width': int_or_none(r.attrib.get('width')),
 709                 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 710                 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 711                 'filesize': filesize,
 712                 'fps': int_or_none(r.attrib.get('frameRate')),
 713             }
 714             try:
 715                 existing_format = next(
 716                     fo for fo in formats
 717                     if fo['format_id'] == format_id)
 718             except StopIteration:
 719                 f.update(self._formats.get(format_id, {}))
 720                 formats.append(f)
 721             else:
 722                 existing_format.update(f)
 723         return formats
 724
    def _real_extract(self, url):
        """Extract the metadata and the available formats of a single video.

        Downloads the watch page, obtains the ``video_info`` mapping (from
        the embedded ``ytplayer.config`` or, failing that, from the
        get_video_info endpoint), scrapes the descriptive fields from the
        page and builds the format list (rtmp, url-encoded stream maps or an
        HLS manifest, optionally extended with the DASH manifest).

        Returns the info dict, or None when the 'listsubtitles' parameter is
        set (the available subtitles are only listed in that case).

        Raises ExtractorError when the video info carries no 'token', for
        rental videos, when the uploader is missing, for rtmpe streams and
        when no usable format source is found.
        """
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslash escapes from the matched URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # (in every branch video_info ends up mapping each key to a list of
        # values, the shape returned by compat_parse_qs)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                # Try each 'el' variant in turn and stop at the first
                # response that contains a token
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        # Without a token the extraction is aborted; the 'reason' field, when
        # present, is passed on as an expected error
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Replace '/', ',' and '-' by spaces and collapse whitespace
            # before handing the text to unified_strdate
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # categories (only the first link of the "Category" list is used)
        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', fatal=False)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace each redirect link by the content of its title attribute
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            # Fall back to the description meta tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Read the number shown in the element with id "watch-<count_name>",
            # dropping the thousands separators; None when it is not found
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        # When only a listing was requested, print it and stop here
        # (the method returns None in that case)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn an {itag: url} mapping into a list of format dicts,
            # completed with the static data from self._formats when the
            # itag is known. player_url is read from the enclosing scope
            # at call time.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            # Both maps are comma separated lists of url-encoded entries, so
            # they are joined and processed together
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                # 'sig' is appended as it is, 's' has to be decrypted first
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]

                    # NOTE: player_url is rebound here, so the value found
                    # for this entry is also the one seen by later
                    # iterations and by the code after the loop
                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, 'JS player URL')
                        player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    # In verbose mode, report which player is used and the
                    # signature cache id of the encrypted signature
                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        # (a failure here only produces a warning, the formats found above
        # are kept)
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if not dash_mpd:
                self.report_warning('%s: DASH manifest missing' % video_id)
            else:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    formats.extend(dash_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1037
1038
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, given either as a URL or as a bare id."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' IE."""
        results = []
        for vid_id in ids:
            results.append(
                self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix ('RD...') playlist.

        A mix is generated from a single video: the playlist id is 'RD'
        followed by that video's id, so the last 11 characters are used as
        the video to open.
        """
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # The first of these classes that yields a non-empty element holds
        # the title.
        title_html = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_html = get_element_by_attribute('class', class_name, webpage)
            if title_html:
                break
        title = clean_html(title_html)

        mix_video_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_video_re, webpage))
        entries = self._ids_to_results(ids)

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        When the URL also names a video and the 'noplaylist' parameter is
        set, only that video is returned. Mixes are delegated to
        ``_extract_mix``. Raises ExtractorError for URLs that do not match,
        for 'TL' ids and for playlists reported as missing or private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query:
            video_id = query['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages. The first round
        # reads both the videos and the "load more" link from the page
        # itself; later rounds read them from the JSON answers.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                match.group('id')
                for match in re.finditer(self._VIDEO_RE, content_html)
                if match.group('index') != '0')
            ids.extend(page_ids)

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1209
1210
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the list with the requested title on the channel page and
        return its videos as a playlist result.

        The list page is downloaded again and again, without any limit,
        until it contains at least one video.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The url-encoded form 'title=<title>' is what gets searched for in
        # the channel page.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1252
1253
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a ``/channel/<id>`` page."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, without duplicates and
        in order of first appearance."""
        already_seen = set()
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in already_seen:
                continue
            already_seen.add(candidate)
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result for the channel referenced by *url*."""
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            return self.playlist_result(
                [self.url_result(vid, 'Youtube', video_id=vid)
                 for vid in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _paged_entries():
            # Lazily walk the ajax pages until the "load more" widget is gone
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                for vid in self.extract_videos_from_page(page['content_html']):
                    yield self.url_result(vid, 'Youtube', video_id=vid)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    return
                pagenum += 1

        return self.playlist_result(_paged_entries(), channel_id)
1310
1311
class YoutubeUserIE(InfoExtractor):
    """Extractor listing a user's uploads through the GData uploads feed.

    Matches user page URLs as well as the ``ytuser:<name>`` keyword.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module claims *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError when a feed page is not valid JSON.
        """
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # pagenum is 0-based while the feed's start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this page (inclusive), for the progress
            # message only
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries means we are past the last upload
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1380
1381
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData ``videos`` feed (jsonc format)."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until *n* ids were collected or
        the total reported by the API is reached. Returns a playlist result
        titled with *query* holding at most *n* entries.

        Raises ExtractorError (expected) when the very first page carries no
        'items' key, i.e. the search yielded nothing at all.
        """

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                if video_ids:
                    # A later page came back without items (for instance if
                    # 'totalItems' promised more than the API delivers):
                    # keep what was collected instead of discarding it.
                    break
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never page past what the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us over the requested amount
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
1423
1424
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant whose API URL additionally passes ``orderby=published``."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1430
1431
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for ``/results?search_query=...`` pages: scrapes the
    result list out of the HTML of the page itself."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict titled with the decoded search query."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from(snippet):
            # The title is optional, the link is mandatory
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
                snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            }

        # Every result sits in its own <h3 class="yt-lockup-title"> element
        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [_entry_from(heading) for heading in headings],
            'title': query,
        }
1473
1474
class YoutubeShowIE(InfoExtractor):
    """Extractor for ``/show/<id>`` pages: every playlist link found on the
    page becomes one entry of the resulting playlist."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict whose entries point at the season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1509
1510
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base of the extractors reading their videos from
    http://www.youtube.com/feed_ajax
    A subclass has to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL of the feed with a single ``%s`` left for the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Follow the feed's "load more" links and return all videos found
        as one playlist result."""
        collected = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, look for the next link in the feed
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_number += 1
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1556
1557
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1563
1564
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1571
1572
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is not a valid escape sequence in a normal literal
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1579
1580
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor resolving the favourites page to its underlying playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Hand the playlist id found on the favourites page over to the
        playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
1591
1592
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed of the current session."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the video ids of the feed page and of every "load more"
        continuation, and return them as a playlist dict."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        # The first page acts as both the content and the "load more" widget
        content_html = first_page
        more_widget_html = first_page
        page_num = 1

        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1629
1630
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and answers them
    with a hint about quoting the URL instead of extracting anything."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying the quoting hint."""
        hint = (
            'Did you forget to quote the URL? '
            'Remember that & is a meta character in most shells, '
            'so you want to put the URL in quotes, like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc"  '
            'or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)