_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     OnDemandPagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36
  37 class YoutubeBaseInfoExtractor(InfoExtractor):
  38     """Provide base functions for Youtube extractors"""
  39     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  40     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  41     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  42     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  43     _NETRC_MACHINE = 'youtube'
  44     # If True it will raise an error if no login info is provided
  45     _LOGIN_REQUIRED = False
  46
  47     def _set_language(self):
  48         return bool(self._download_webpage(
  49             self._LANG_URL, None,
  50             note='Setting language', errnote='unable to set language',
  51             fatal=False))
  52
  53     def _login(self):
  54         """
  55         Attempt to log in to YouTube.
  56         True is returned if successful or skipped.
  57         False is returned if login failed.
  58
  59         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  60         """
  61         (username, password) = self._get_login_info()
  62         # No authentication to be performed
  63         if username is None:
  64             if self._LOGIN_REQUIRED:
  65                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  66             return True
  67
  68         login_page = self._download_webpage(
  69             self._LOGIN_URL, None,
  70             note='Downloading login page',
  71             errnote='unable to fetch login page', fatal=False)
  72         if login_page is False:
  73             return
  74
  75         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  76                                   login_page, 'Login GALX parameter')
  77
  78         # Log in
  79         login_form_strs = {
  80                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  81                 'Email': username,
  82                 'GALX': galx,
  83                 'Passwd': password,
  84
  85                 'PersistentCookie': 'yes',
  86                 '_utf8': '霱',
  87                 'bgresponse': 'js_disabled',
  88                 'checkConnection': '',
  89                 'checkedDomains': 'youtube',
  90                 'dnConn': '',
  91                 'pstMsg': '0',
  92                 'rmShown': '1',
  93                 'secTok': '',
  94                 'signIn': 'Sign in',
  95                 'timeStmp': '',
  96                 'service': 'youtube',
  97                 'uilel': '3',
  98                 'hl': 'en_US',
  99         }
 100
 101         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 102         # chokes on unicode
 103         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
 104         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 105
 106         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 107         login_results = self._download_webpage(
 108             req, None,
 109             note='Logging in', errnote='unable to log in', fatal=False)
 110         if login_results is False:
 111             return False
 112
 113         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 114             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 115
 116         # Two-Factor
 117         # TODO add SMS and phone call support - these require making a request and then prompting the user
 118
 119         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 120             tfa_code = self._get_tfa_info()
 121
 122             if tfa_code is None:
 123                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 124                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 125                 return False
 126
 127             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 128
 129             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 130             if match is None:
 131                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 132             secTok = match.group(1)
 133             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 134             if match is None:
 135                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 136             timeStmp = match.group(1)
 137
 138             tfa_form_strs = {
 139                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 140                 'smsToken': '',
 141                 'smsUserPin': tfa_code,
 142                 'smsVerifyPin': 'Verify',
 143
 144                 'PersistentCookie': 'yes',
 145                 'checkConnection': '',
 146                 'checkedDomains': 'youtube',
 147                 'pstMsg': '1',
 148                 'secTok': secTok,
 149                 'timeStmp': timeStmp,
 150                 'service': 'youtube',
 151                 'hl': 'en_US',
 152             }
 153             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 154             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 155
 156             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 157             tfa_results = self._download_webpage(
 158                 tfa_req, None,
 159                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 160
 161             if tfa_results is False:
 162                 return False
 163
 164             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 165                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 166                 return False
 167             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 168                 self._downloader.report_warning('unable to log in - did the page structure change?')
 169                 return False
 170             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 171                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 172                 return False
 173
 174         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 175             self._downloader.report_warning('unable to log in: bad username or password')
 176             return False
 177         return True
 178
 179     def _confirm_age(self):
 180         age_form = {
 181             'next_url': '/',
 182             'action_confirm': 'Confirm',
 183         }
 184         req = compat_urllib_request.Request(self._AGE_URL,
 185             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 186
 187         self._download_webpage(
 188             req, None,
 189             note='Confirming age', errnote='Unable to confirm age',
 190             fatal=False)
 191
 192     def _real_initialize(self):
 193         if self._downloader is None:
 194             return
 195         if self._get_login_info()[0] is not None:
 196             if not self._set_language():
 197                 return
 198         if not self._login():
 199             return
 200         self._confirm_age()
 201
 202
 203 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 204     IE_DESC = 'YouTube.com'
 205     _VALID_URL = r"""(?x)^
 206                      (
 207                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 208                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 209                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 210                             (?:www\.)?pwnyoutube\.com/|
 211                             (?:www\.)?yourepeat\.com/|
 212                             tube\.majestyc\.net/|
 213                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 214                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 215                          (?:                                                  # the various things that can precede the ID:
 216                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 217                              |(?:                                             # or the v= param in all its forms
 218                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 219                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 220                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 221                                  v=
 222                              )
 223                          ))
 224                          |youtu\.be/                                          # just youtu.be/xxxx
 225                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 226                          )
 227                      )?                                                       # all until now is optional -> you can pass the naked ID
 228                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 229                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 230                      (?(1).+)?                                                # if we found the ID, everything can follow
 231                      $"""
 232     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 233     _formats = {
 234         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 235         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 236         '13': {'ext': '3gp'},
 237         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 238         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 239         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 240         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 241         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 242         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 243         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 244         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 245         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 246         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 247         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 248         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 249
 250
 251         # 3d videos
 252         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 253         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 254         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 255         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 256         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 257         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 258         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 259
 260         # Apple HTTP Live Streaming
 261         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 262         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 263         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 264         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 265         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 266         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 267         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 268
 269         # DASH mp4 video
 270         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 271         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 272         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 273         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 274         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 275         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 276         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 277         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 278         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 279         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 280         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 281
 282         # Dash mp4 audio
 283         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 284         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 285         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 286
 287         # Dash webm
 288         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 289         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 290         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 291         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 292         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 293         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 294         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
 295         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 296         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 297         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 298         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 299         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 300         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 301         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 302         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 303         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 304         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 305         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 306
 307         # Dash webm audio
 308         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 309         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 310
 311         # Dash webm audio with opus inside
 312         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 313         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 314         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 315
 316         # RTMP (unnamed)
 317         '_rtmp': {'protocol': 'rtmp'},
 318     }
 319
 320     IE_NAME = 'youtube'
 321     _TESTS = [
 322         {
 323             'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
 324             'info_dict': {
 325                 'id': 'BaW_jenozKc',
 326                 'ext': 'mp4',
 327                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 328                 'uploader': 'Philipp Hagemeister',
 329                 'uploader_id': 'phihag',
 330                 'upload_date': '20121002',
 331                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 332                 'categories': ['Science & Technology'],
 333                 'like_count': int,
 334                 'dislike_count': int,
 335             }
 336         },
 337         {
 338             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 339             'note': 'Test generic use_cipher_signature video (#897)',
 340             'info_dict': {
 341                 'id': 'UxxajLWwzqY',
 342                 'ext': 'mp4',
 343                 'upload_date': '20120506',
 344                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 345                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
 346                 'uploader': 'Icona Pop',
 347                 'uploader_id': 'IconaPop',
 348             }
 349         },
 350         {
 351             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 352             'note': 'Test VEVO video with age protection (#956)',
 353             'info_dict': {
 354                 'id': '07FYdnEawAQ',
 355                 'ext': 'mp4',
 356                 'upload_date': '20130703',
 357                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 358                 'description': 'md5:64249768eec3bc4276236606ea996373',
 359                 'uploader': 'justintimberlakeVEVO',
 360                 'uploader_id': 'justintimberlakeVEVO',
 361             }
 362         },
 363         {
 364             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 365             'note': 'Embed-only video (#1746)',
 366             'info_dict': {
 367                 'id': 'yZIXLfi8CZQ',
 368                 'ext': 'mp4',
 369                 'upload_date': '20120608',
 370                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 371                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 372                 'uploader': 'SET India',
 373                 'uploader_id': 'setindia'
 374             }
 375         },
 376         {
 377             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 378             'note': '256k DASH audio (format 141) via DASH manifest',
 379             'info_dict': {
 380                 'id': 'a9LDPn-MO4I',
 381                 'ext': 'm4a',
 382                 'upload_date': '20121002',
 383                 'uploader_id': '8KVIDEO',
 384                 'description': '',
 385                 'uploader': '8KVIDEO',
 386                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 387             },
 388             'params': {
 389                 'youtube_include_dash_manifest': True,
 390                 'format': '141',
 391             },
 392         },
 393         # DASH manifest with encrypted signature
 394         {
 395             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 396             'info_dict': {
 397                 'id': 'IB3lcPjvWLA',
 398                 'ext': 'm4a',
 399                 'title': 'Afrojack - The Spark ft. Spree Wilson',
 400                 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
 401                 'uploader': 'AfrojackVEVO',
 402                 'uploader_id': 'AfrojackVEVO',
 403                 'upload_date': '20131011',
 404             },
 405             'params': {
 406                 'youtube_include_dash_manifest': True,
 407                 'format': '141',
 408             },
 409         },
 410         # Controversy video
 411         {
 412             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
 413             'info_dict': {
 414                 'id': 'T4XJQO3qol8',
 415                 'ext': 'mp4',
 416                 'upload_date': '20100909',
 417                 'uploader': 'The Amazing Atheist',
 418                 'uploader_id': 'TheAmazingAtheist',
 419                 'title': 'Burning Everyone\'s Koran',
 420                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
 421             }
 422         }
 423     ]
 424
 425     def __init__(self, *args, **kwargs):
 426         super(YoutubeIE, self).__init__(*args, **kwargs)
 427         self._player_cache = {}
 428
 429     def report_video_info_webpage_download(self, video_id):
 430         """Report attempt to download video info webpage."""
 431         self.to_screen('%s: Downloading video info webpage' % video_id)
 432
 433     def report_information_extraction(self, video_id):
 434         """Report attempt to extract video information."""
 435         self.to_screen('%s: Extracting video information' % video_id)
 436
 437     def report_unavailable_format(self, video_id, format):
 438         """Report extracted video URL."""
 439         self.to_screen('%s: Format %s not available' % (video_id, format))
 440
 441     def report_rtmp_download(self):
 442         """Indicate the download will use the RTMP protocol."""
 443         self.to_screen('RTMP download detected')
 444
 445     def _signature_cache_id(self, example_sig):
 446         """ Return a string representation of a signature """
 447         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 448
 449     def _extract_signature_function(self, video_id, player_url, example_sig):
 450         id_m = re.match(
 451             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 452             player_url)
 453         if not id_m:
 454             raise ExtractorError('Cannot identify player %r' % player_url)
 455         player_type = id_m.group('ext')
 456         player_id = id_m.group('id')
 457
 458         # Read from filesystem cache
 459         func_id = '%s_%s_%s' % (
 460             player_type, player_id, self._signature_cache_id(example_sig))
 461         assert os.path.basename(func_id) == func_id
 462
 463         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 464         if cache_spec is not None:
 465             return lambda s: ''.join(s[i] for i in cache_spec)
 466
 467         if player_type == 'js':
 468             code = self._download_webpage(
 469                 player_url, video_id,
 470                 note='Downloading %s player %s' % (player_type, player_id),
 471                 errnote='Download of %s failed' % player_url)
 472             res = self._parse_sig_js(code)
 473         elif player_type == 'swf':
 474             urlh = self._request_webpage(
 475                 player_url, video_id,
 476                 note='Downloading %s player %s' % (player_type, player_id),
 477                 errnote='Download of %s failed' % player_url)
 478             code = urlh.read()
 479             res = self._parse_sig_swf(code)
 480         else:
 481             assert False, 'Invalid player type %r' % player_type
 482
 483         if cache_spec is None:
 484             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 485             cache_res = res(test_string)
 486             cache_spec = [ord(c) for c in cache_res]
 487
 488         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 489         return res
 490
 491     def _print_sig_code(self, func, example_sig):
 492         def gen_sig_code(idxs):
 493             def _genslice(start, end, step):
 494                 starts = '' if start == 0 else str(start)
 495                 ends = (':%d' % (end+step)) if end + step >= 0 else ':'
 496                 steps = '' if step == 1 else (':%d' % step)
 497                 return 's[%s%s%s]' % (starts, ends, steps)
 498
 499             step = None
 500             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 501                                     # set as soon as step is set
 502             for i, prev in zip(idxs[1:], idxs[:-1]):
 503                 if step is not None:
 504                     if i - prev == step:
 505                         continue
 506                     yield _genslice(start, prev, step)
 507                     step = None
 508                     continue
 509                 if i - prev in [-1, 1]:
 510                     step = i - prev
 511                     start = prev
 512                     continue
 513                 else:
 514                     yield 's[%d]' % prev
 515             if step is None:
 516                 yield 's[%d]' % i
 517             else:
 518                 yield _genslice(start, i, step)
 519
 520         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 521         cache_res = func(test_string)
 522         cache_spec = [ord(c) for c in cache_res]
 523         expr_code = ' + '.join(gen_sig_code(cache_spec))
 524         signature_id_tuple = '(%s)' % (
 525             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 526         code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 527                 '    return %s\n') % (signature_id_tuple, expr_code)
 528         self.to_screen('Extracted signature function:\n' + code)
 529
 530     def _parse_sig_js(self, jscode):
 531         funcname = self._search_regex(
 532             r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
 533              'Initial JS player signature function name')
 534
 535         jsi = JSInterpreter(jscode)
 536         initial_function = jsi.extract_function(funcname)
 537         return lambda s: initial_function([s])
 538
 539     def _parse_sig_swf(self, file_contents):
 540         swfi = SWFInterpreter(file_contents)
 541         TARGET_CLASSNAME = 'SignatureDecipher'
 542         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 543         initial_function = swfi.extract_function(searched_class, 'decipher')
 544         return lambda s: initial_function([s])
 545
 546     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 547         """Turn the encrypted s field into a working signature"""
 548
 549         if player_url is None:
 550             raise ExtractorError('Cannot decrypt signature without player_url')
 551
 552         if player_url.startswith('//'):
 553             player_url = 'https:' + player_url
 554         try:
 555             player_id = (player_url, self._signature_cache_id(s))
 556             if player_id not in self._player_cache:
 557                 func = self._extract_signature_function(
 558                     video_id, player_url, s
 559                 )
 560                 self._player_cache[player_id] = func
 561             func = self._player_cache[player_id]
 562             if self._downloader.params.get('youtube_print_sig_code'):
 563                 self._print_sig_code(func, s)
 564             return func(s)
 565         except Exception as e:
 566             tb = traceback.format_exc()
 567             raise ExtractorError(
 568                 'Signature extraction failed: ' + tb, cause=e)
 569
 570     def _get_available_subtitles(self, video_id, webpage):
 571         try:
 572             sub_list = self._download_webpage(
 573                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 574                 video_id, note=False)
 575         except ExtractorError as err:
 576             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 577             return {}
 578         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 579
 580         sub_lang_list = {}
 581         for l in lang_list:
 582             lang = l[1]
 583             if lang in sub_lang_list:
 584                 continue
 585             params = compat_urllib_parse.urlencode({
 586                 'lang': lang,
 587                 'v': video_id,
 588                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 589                 'name': unescapeHTML(l[0]).encode('utf-8'),
 590             })
 591             url = 'https://www.youtube.com/api/timedtext?' + params
 592             sub_lang_list[lang] = url
 593         if not sub_lang_list:
 594             self._downloader.report_warning('video doesn\'t have subtitles')
 595             return {}
 596         return sub_lang_list
 597
 598     def _get_available_automatic_caption(self, video_id, webpage):
 599         """We need the webpage for getting the captions url, pass it as an
 600            argument to speed up the process."""
 601         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 602         self.to_screen('%s: Looking for automatic captions' % video_id)
 603         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 604         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 605         if mobj is None:
 606             self._downloader.report_warning(err_msg)
 607             return {}
 608         player_config = json.loads(mobj.group(1))
 609         try:
 610             args = player_config[u'args']
 611             caption_url = args[u'ttsurl']
 612             timestamp = args[u'timestamp']
 613             # We get the available subtitles
 614             list_params = compat_urllib_parse.urlencode({
 615                 'type': 'list',
 616                 'tlangs': 1,
 617                 'asrs': 1,
 618             })
 619             list_url = caption_url + '&' + list_params
 620             caption_list = self._download_xml(list_url, video_id)
 621             original_lang_node = caption_list.find('track')
 622             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
 623                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 624                 return {}
 625             original_lang = original_lang_node.attrib['lang_code']
 626
 627             sub_lang_list = {}
 628             for lang_node in caption_list.findall('target'):
 629                 sub_lang = lang_node.attrib['lang_code']
 630                 params = compat_urllib_parse.urlencode({
 631                     'lang': original_lang,
 632                     'tlang': sub_lang,
 633                     'fmt': sub_format,
 634                     'ts': timestamp,
 635                     'kind': 'asr',
 636                 })
 637                 sub_lang_list[sub_lang] = caption_url + '&' + params
 638             return sub_lang_list
 639         # An extractor error can be raise by the download process if there are
 640         # no automatic captions but there are subtitles
 641         except (KeyError, ExtractorError):
 642             self._downloader.report_warning(err_msg)
 643             return {}
 644
 645     @classmethod
 646     def extract_id(cls, url):
 647         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 648         if mobj is None:
 649             raise ExtractorError('Invalid URL: %s' % url)
 650         video_id = mobj.group(2)
 651         return video_id
 652
 653     def _extract_from_m3u8(self, manifest_url, video_id):
 654         url_map = {}
 655
 656         def _get_urls(_manifest):
 657             lines = _manifest.split('\n')
 658             urls = filter(lambda l: l and not l.startswith('#'),
 659                             lines)
 660             return urls
 661         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 662         formats_urls = _get_urls(manifest)
 663         for format_url in formats_urls:
 664             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 665             url_map[itag] = format_url
 666         return url_map
 667
 668     def _extract_annotations(self, video_id):
 669         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 670         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 671
 672     def _real_extract(self, url):
 673         proto = (
 674             'http' if self._downloader.params.get('prefer_insecure', False)
 675             else 'https')
 676
 677         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 678         mobj = re.search(self._NEXT_URL_RE, url)
 679         if mobj:
 680             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 681         video_id = self.extract_id(url)
 682
 683         # Get video webpage
 684         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
 685         pref_cookies = [
 686             c for c in self._downloader.cookiejar
 687             if c.domain == '.youtube.com' and c.name == 'PREF']
 688         for pc in pref_cookies:
 689             if 'hl=' in pc.value:
 690                 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
 691             else:
 692                 if pc.value:
 693                     pc.value += '&'
 694                 pc.value += 'hl=en'
 695         video_webpage = self._download_webpage(url, video_id)
 696
 697         # Attempt to extract SWF player URL
 698         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 699         if mobj is not None:
 700             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 701         else:
 702             player_url = None
 703
 704         # Get video info
 705         self.report_video_info_webpage_download(video_id)
 706         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 707             age_gate = True
 708             # We simulate the access to the video from www.youtube.com/v/{video_id}
 709             # this can be viewed without login into Youtube
 710             data = compat_urllib_parse.urlencode({
 711                 'video_id': video_id,
 712                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 713                 'sts': self._search_regex(
 714                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
 715             })
 716             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 717             video_info_webpage = self._download_webpage(
 718                 video_info_url, video_id,
 719                 note='Refetching age-gated info webpage',
 720                 errnote='unable to download video info webpage')
 721             video_info = compat_parse_qs(video_info_webpage)
 722         else:
 723             age_gate = False
 724             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 725                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 726                         % (video_id, el_type))
 727                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 728                                         note=False,
 729                                         errnote='unable to download video info webpage')
 730                 video_info = compat_parse_qs(video_info_webpage)
 731                 if 'token' in video_info:
 732                     break
 733         if 'token' not in video_info:
 734             if 'reason' in video_info:
 735                 raise ExtractorError(
 736                     'YouTube said: %s' % video_info['reason'][0],
 737                     expected=True, video_id=video_id)
 738             else:
 739                 raise ExtractorError(
 740                     '"token" parameter not in video info for unknown reason',
 741                     video_id=video_id)
 742
 743         if 'view_count' in video_info:
 744             view_count = int(video_info['view_count'][0])
 745         else:
 746             view_count = None
 747
 748         # Check for "rental" videos
 749         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 750             raise ExtractorError('"rental" videos not supported')
 751
 752         # Start extracting information
 753         self.report_information_extraction(video_id)
 754
 755         # uploader
 756         if 'author' not in video_info:
 757             raise ExtractorError('Unable to extract uploader name')
 758         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 759
 760         # uploader_id
 761         video_uploader_id = None
 762         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 763         if mobj is not None:
 764             video_uploader_id = mobj.group(1)
 765         else:
 766             self._downloader.report_warning('unable to extract uploader nickname')
 767
 768         # title
 769         if 'title' in video_info:
 770             video_title = video_info['title'][0]
 771         else:
 772             self._downloader.report_warning('Unable to extract video title')
 773             video_title = '_'
 774
 775         # thumbnail image
 776         # We try first to get a high quality image:
 777         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 778                             video_webpage, re.DOTALL)
 779         if m_thumb is not None:
 780             video_thumbnail = m_thumb.group(1)
 781         elif 'thumbnail_url' not in video_info:
 782             self._downloader.report_warning('unable to extract video thumbnail')
 783             video_thumbnail = None
 784         else:   # don't panic if we can't find it
 785             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 786
 787         # upload date
 788         upload_date = None
 789         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 790         if mobj is None:
 791             mobj = re.search(
 792                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 793                 video_webpage)
 794         if mobj is not None:
 795             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 796             upload_date = unified_strdate(upload_date)
 797
 798         m_cat_container = self._search_regex(
 799             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 800             video_webpage, 'categories', fatal=False)
 801         if m_cat_container:
 802             category = self._html_search_regex(
 803                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 804                 default=None)
 805             video_categories = None if category is None else [category]
 806         else:
 807             video_categories = None
 808
 809         # description
 810         video_description = get_element_by_id("eow-description", video_webpage)
 811         if video_description:
 812             video_description = re.sub(r'''(?x)
 813                 <a\s+
 814                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 815                     title="([^"]+)"\s+
 816                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 817                     class="yt-uix-redirect-link"\s*>
 818                 [^<]+
 819                 </a>
 820             ''', r'\1', video_description)
 821             video_description = clean_html(video_description)
 822         else:
 823             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 824             if fd_mobj:
 825                 video_description = unescapeHTML(fd_mobj.group(1))
 826             else:
 827                 video_description = ''
 828
 829         def _extract_count(count_name):
 830             count = self._search_regex(
 831                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 832                 video_webpage, count_name, default=None)
 833             if count is not None:
 834                 return int(count.replace(',', ''))
 835             return None
 836         like_count = _extract_count('like')
 837         dislike_count = _extract_count('dislike')
 838
 839         # subtitles
 840         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 841
 842         if self._downloader.params.get('listsubtitles', False):
 843             self._list_available_subtitles(video_id, video_webpage)
 844             return
 845
 846         if 'length_seconds' not in video_info:
 847             self._downloader.report_warning('unable to extract video duration')
 848             video_duration = None
 849         else:
 850             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 851
 852         # annotations
 853         video_annotations = None
 854         if self._downloader.params.get('writeannotations', False):
 855             video_annotations = self._extract_annotations(video_id)
 856
 857         # Decide which formats to download
 858         try:
 859             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 860             if not mobj:
 861                 raise ValueError('Could not find vevo ID')
 862             json_code = uppercase_escape(mobj.group(1))
 863             ytplayer_config = json.loads(json_code)
 864             args = ytplayer_config['args']
 865             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 866             # this signatures are encrypted
 867             if 'url_encoded_fmt_stream_map' not in args:
 868                 raise ValueError('No stream_map present')  # caught below
 869             re_signature = re.compile(r'[&,]s=')
 870             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 871             if m_s is not None:
 872                 self.to_screen('%s: Encrypted signatures detected.' % video_id)
 873                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 874             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 875             if m_s is not None:
 876                 if 'adaptive_fmts' in video_info:
 877                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 878                 else:
 879                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 880         except ValueError:
 881             pass
 882
 883         def _map_to_format_list(urlmap):
 884             formats = []
 885             for itag, video_real_url in urlmap.items():
 886                 dct = {
 887                     'format_id': itag,
 888                     'url': video_real_url,
 889                     'player_url': player_url,
 890                 }
 891                 if itag in self._formats:
 892                     dct.update(self._formats[itag])
 893                 formats.append(dct)
 894             return formats
 895
 896         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 897             self.report_rtmp_download()
 898             formats = [{
 899                 'format_id': '_rtmp',
 900                 'protocol': 'rtmp',
 901                 'url': video_info['conn'][0],
 902                 'player_url': player_url,
 903             }]
 904         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 905             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
 906             if 'rtmpe%3Dyes' in encoded_url_map:
 907                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 908             url_map = {}
 909             for url_data_str in encoded_url_map.split(','):
 910                 url_data = compat_parse_qs(url_data_str)
 911                 if 'itag' not in url_data or 'url' not in url_data:
 912                     continue
 913                 format_id = url_data['itag'][0]
 914                 url = url_data['url'][0]
 915
 916                 if 'sig' in url_data:
 917                     url += '&signature=' + url_data['sig'][0]
 918                 elif 's' in url_data:
 919                     encrypted_sig = url_data['s'][0]
 920
 921                     if not age_gate:
 922                         jsplayer_url_json = self._search_regex(
 923                             r'"assets":.+?"js":\s*("[^"]+")',
 924                             video_webpage, 'JS player URL')
 925                         player_url = json.loads(jsplayer_url_json)
 926                     if player_url is None:
 927                         player_url_json = self._search_regex(
 928                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 929                             video_webpage, 'age gate player URL')
 930                         player_url = json.loads(player_url_json)
 931
 932                     if self._downloader.params.get('verbose'):
 933                         if player_url is None:
 934                             player_version = 'unknown'
 935                             player_desc = 'unknown'
 936                         else:
 937                             if player_url.endswith('swf'):
 938                                 player_version = self._search_regex(
 939                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 940                                     'flash player', fatal=False)
 941                                 player_desc = 'flash player %s' % player_version
 942                             else:
 943                                 player_version = self._search_regex(
 944                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 945                                     player_url,
 946                                     'html5 player', fatal=False)
 947                                 player_desc = 'html5 player %s' % player_version
 948
 949                         parts_sizes = self._signature_cache_id(encrypted_sig)
 950                         self.to_screen('{%s} signature length %s, %s' %
 951                             (format_id, parts_sizes, player_desc))
 952
 953                     signature = self._decrypt_signature(
 954                         encrypted_sig, video_id, player_url, age_gate)
 955                     url += '&signature=' + signature
 956                 if 'ratebypass' not in url:
 957                     url += '&ratebypass=yes'
 958                 url_map[format_id] = url
 959             formats = _map_to_format_list(url_map)
 960         elif video_info.get('hlsvp'):
 961             manifest_url = video_info['hlsvp'][0]
 962             url_map = self._extract_from_m3u8(manifest_url, video_id)
 963             formats = _map_to_format_list(url_map)
 964         else:
 965             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 966
 967         # Look for the DASH manifest
 968         if self._downloader.params.get('youtube_include_dash_manifest', True):
 969             try:
 970                 # The DASH manifest used needs to be the one from the original video_webpage.
 971                 # The one found in get_video_info seems to be using different signatures.
 972                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 973                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 974                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 975                 if age_gate:
 976                     dash_manifest_url = video_info.get('dashmpd')[0]
 977                 else:
 978                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 979
 980                 def decrypt_sig(mobj):
 981                     s = mobj.group(1)
 982                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 983                     return '/signature/%s' % dec_s
 984                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 985                 dash_doc = self._download_xml(
 986                     dash_manifest_url, video_id,
 987                     note='Downloading DASH manifest',
 988                     errnote='Could not download DASH manifest')
 989                 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 990                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 991                     if url_el is None:
 992                         continue
 993                     format_id = r.attrib['id']
 994                     video_url = url_el.text
 995                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 996                     f = {
 997                         'format_id': format_id,
 998                         'url': video_url,
 999                         'width': int_or_none(r.attrib.get('width')),
1000                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1001                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1002                         'filesize': filesize,
1003                     }
1004                     try:
1005                         existing_format = next(
1006                             fo for fo in formats
1007                             if fo['format_id'] == format_id)
1008                     except StopIteration:
1009                         f.update(self._formats.get(format_id, {}))
1010                         formats.append(f)
1011                     else:
1012                         existing_format.update(f)
1013
1014             except (ExtractorError, KeyError) as e:
1015                 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1016
1017         self._sort_formats(formats)
1018
1019         return {
1020             'id':           video_id,
1021             'uploader':     video_uploader,
1022             'uploader_id':  video_uploader_id,
1023             'upload_date':  upload_date,
1024             'title':        video_title,
1025             'thumbnail':    video_thumbnail,
1026             'description':  video_description,
1027             'categories':   video_categories,
1028             'subtitles':    video_subtitles,
1029             'duration':     video_duration,
1030             'age_limit':    18 if age_gate else 0,
1031             'annotations':  video_annotations,
1032             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1033             'view_count':   view_count,
1034             'like_count': like_count,
1035             'dislike_count': dislike_count,
1036             'formats':      formats,
1037         }
1038
1039
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes (ids starting with 'RD')."""
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Attempt to log in before any extraction."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix.

        Mixes are generated from a single video: the playlist id is just
        'RD' followed by the video id, so the watch page of that video is
        downloaded and scanned for the entries of the mix.
        """
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        def _element_with_class(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        # Candidate title elements, tried in this order
        title_span = _element_with_class('playlist-title')
        if not title_span:
            title_span = _element_with_class('title long-title')
        if not title_span:
            title_span = _element_with_class('title')
        title = clean_html(title_span)

        entry_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(entry_re, webpage))
        return self.playlist_result(self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        When the URL also names a video and the 'noplaylist' option is set,
        a result for just that video is returned instead.

        Raises ExtractorError for an invalid URL, for top lists ('TL' ids)
        and for playlists reported as non-existent or private.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query:
            video_id = query['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page serves
        # both as content and as the place to look for the "load more" link
        video_ids = []
        content_html = more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            video_ids.extend(page_ids)

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(video_ids), playlist_id, playlist_title)
1210
1211
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo-URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Maximum number of times the list page is downloaded again when it comes
    # back without any video
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in ``url``.

        The channel page is searched for the link of the list with the
        requested title, then the list page is downloaded and scanned for
        video ids. The list page sometimes comes back without videos, so it
        is downloaded again up to ``_MAX_RETRIES`` times.

        Raises ExtractorError when no video is found after the last retry.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # urlencode yields 'title=<encoded title>', which is then searched
        # for in the channel page
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after _MAX_RETRIES retries
        # instead of requesting the page forever
        for i in range(self._MAX_RETRIES + 1):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any video in the top list after %d retries'
                % self._MAX_RETRIES)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1253
1254
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel as a playlist."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from `page`, without duplicates, in order of first appearance."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """
        Build a playlist result holding every video id found for the channel.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_re = r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"'''
        is_autogenerated = re.search(autogenerated_re, channel_page) is not None

        if is_autogenerated:
            # The videos are contained in a single page;
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the "load more"
            # widget no longer advertises a further page
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1314
1315
class YoutubeUserIE(InfoExtractor):
    """Extractor that lists a user's uploads through the GData uploads feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if `url` matches and no other `*IE` class of this module accepts it."""
        # The regex of this extractor is too permissive and would also match
        # URLs handled by the other youtube extractors, so give them
        # precedence.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """
        Return a playlist result whose entries are fetched page by page.

        Raises ExtractorError when `url` does not match _VALID_URL or when
        the API answers with invalid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Page `pagenum` maps to the 1-based start-index
            # pagenum * _GDATA_PAGE_SIZE + 1
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Index of the last entry this request can return (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries means there is nothing left to yield
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1389
1390
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (jsonc output)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """
        Get a specified number of results for a query.

        Pages of 50 results are requested until `n` ids are collected or the
        total reported by the API is reached. Raises ExtractorError when a
        response carries no 'items'.
        """
        PAGE_SIZE = 50
        collected_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past n; keep the first n only
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]
        ]
        return self.playlist_result(videos, query)
1432
1433
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, plus the ordering parameter
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1439
1440
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor that turns a youtube.com/results page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page and return it as a playlist titled with the query."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result title heading holds both the item title and its link
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1482
1483
class YoutubeShowIE(InfoExtractor):
    """Extractor that maps a show page to a playlist of its season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at every playlist link found on the show page."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
1518
1519
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for extractors reading
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining %s placeholder takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of every feed page and return them as a playlist."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, look for the next page link in the
            # feed html itself
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1565
1566
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1572
1573
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1579     _PERSONAL_FEED = True
1580
1581
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: "\." is not a valid escape sequence in a plain string
    # literal (the resulting regex is unchanged)
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1588
1589
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that hands the favourites playlist over to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the my_favorites page and return a url result for it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1600
1601
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor that returns the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Gather the video ids of the feed page and of every "load more" page it links to."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        content_html = page
        more_widget_html = page

        page_num = 1
        while True:
            ids.extend(orderedSet(
                re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html, more_widget_html = (
                more['content_html'], more['load_more_widget_html'])
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1638
1639
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and reports a quoting hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)