_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 import collections
   4 import errno
   5 import io
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import struct
  11 import traceback
  12 import zlib
  13
  14 from .common import InfoExtractor, SearchInfoExtractor
  15 from .subtitles import SubtitlesInfoExtractor
  16 from ..jsinterp import JSInterpreter
  17 from ..utils import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24
  25     clean_html,
  26     get_cachedir,
  27     get_element_by_id,
  28     get_element_by_attribute,
  29     ExtractorError,
  30     int_or_none,
  31     PagedList,
  32     unescapeHTML,
  33     unified_strdate,
  34     orderedSet,
  35     write_json_file,
  36     uppercase_escape,
  37 )
  38
  39 class YoutubeBaseInfoExtractor(InfoExtractor):
  40     """Provide base functions for Youtube extractors"""
  41     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  42     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  43     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  44     _NETRC_MACHINE = 'youtube'
  45     # If True it will raise an error if no login info is provided
  46     _LOGIN_REQUIRED = False
  47
  48     def _set_language(self):
  49         return bool(self._download_webpage(
  50             self._LANG_URL, None,
  51             note=u'Setting language', errnote='unable to set language',
  52             fatal=False))
  53
  54     def _login(self):
  55         (username, password) = self._get_login_info()
  56         # No authentication to be performed
  57         if username is None:
  58             if self._LOGIN_REQUIRED:
  59                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  60             return False
  61
  62         login_page = self._download_webpage(
  63             self._LOGIN_URL, None,
  64             note=u'Downloading login page',
  65             errnote=u'unable to fetch login page', fatal=False)
  66         if login_page is False:
  67             return
  68
  69         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  70                                   login_page, u'Login GALX parameter')
  71
  72         # Log in
  73         login_form_strs = {
  74                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  75                 u'Email': username,
  76                 u'GALX': galx,
  77                 u'Passwd': password,
  78                 u'PersistentCookie': u'yes',
  79                 u'_utf8': u'霱',
  80                 u'bgresponse': u'js_disabled',
  81                 u'checkConnection': u'',
  82                 u'checkedDomains': u'youtube',
  83                 u'dnConn': u'',
  84                 u'pstMsg': u'0',
  85                 u'rmShown': u'1',
  86                 u'secTok': u'',
  87                 u'signIn': u'Sign in',
  88                 u'timeStmp': u'',
  89                 u'service': u'youtube',
  90                 u'uilel': u'3',
  91                 u'hl': u'en_US',
  92         }
  93         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
  94         # chokes on unicode
  95         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
  96         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
  97
  98         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
  99         login_results = self._download_webpage(
 100             req, None,
 101             note=u'Logging in', errnote=u'unable to log in', fatal=False)
 102         if login_results is False:
 103             return False
 104         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 105             self._downloader.report_warning(u'unable to log in: bad username or password')
 106             return False
 107         return True
 108
 109     def _confirm_age(self):
 110         age_form = {
 111             'next_url': '/',
 112             'action_confirm': 'Confirm',
 113         }
 114         req = compat_urllib_request.Request(self._AGE_URL,
 115             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 116
 117         self._download_webpage(
 118             req, None,
 119             note=u'Confirming age', errnote=u'Unable to confirm age')
 120         return True
 121
 122     def _real_initialize(self):
 123         if self._downloader is None:
 124             return
 125         if not self._set_language():
 126             return
 127         if not self._login():
 128             return
 129         self._confirm_age()
 130
 131
 132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 133     IE_DESC = u'YouTube.com'
 134     _VALID_URL = r"""(?x)^
 135                      (
 136                          (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
 137                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 138                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 139                             (?:www\.)?pwnyoutube\.com/|
 140                             (?:www\.)?yourepeat\.com/|
 141                             tube\.majestyc\.net/|
 142                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 143                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 144                          (?:                                                  # the various things that can precede the ID:
 145                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 146                              |(?:                                             # or the v= param in all its forms
 147                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 148                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 149                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 150                                  v=
 151                              )
 152                          ))
 153                          |youtu\.be/                                          # just youtu.be/xxxx
 154                          |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 155                          )
 156                      )?                                                       # all until now is optional -> you can pass the naked ID
 157                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 158                      (?(1).+)?                                                # if we found the ID, everything can follow
 159                      $"""
 160     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 161     _formats = {
 162         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 163         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 164         '13': {'ext': '3gp'},
 165         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 166         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 167         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 168         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 169         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 170         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 171         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 172         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 173         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 174         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 175         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 176         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 177
 178
 179         # 3d videos
 180         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 181         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 182         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 183         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 184         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 185         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 186         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 187
 188         # Apple HTTP Live Streaming
 189         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 190         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 191         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 192         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 193         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 194         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 195         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 196
 197         # DASH mp4 video
 198         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 199         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 200         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 201         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 202         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 203         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 204         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 205         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 206
 207         # Dash mp4 audio
 208         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 209         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 210         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 211
 212         # Dash webm
 213         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 214         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 215         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 216         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 217         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 218         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 219         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 220         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 221         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 222         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 223         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 224         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 225         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 226         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 227         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 228
 229         # Dash webm audio
 230         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
 231         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 232
 233         # RTMP (unnamed)
 234         '_rtmp': {'protocol': 'rtmp'},
 235     }
 236
 237     IE_NAME = u'youtube'
 238     _TESTS = [
 239         {
 240             u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
 241             u"file":  u"BaW_jenozKc.mp4",
 242             u"info_dict": {
 243                 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
 244                 u"uploader": u"Philipp Hagemeister",
 245                 u"uploader_id": u"phihag",
 246                 u"upload_date": u"20121002",
 247                 u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
 248                 u"categories": [u'Science & Technology'],
 249             }
 250         },
 251         {
 252             u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
 253             u"file":  u"UxxajLWwzqY.mp4",
 254             u"note": u"Test generic use_cipher_signature video (#897)",
 255             u"info_dict": {
 256                 u"upload_date": u"20120506",
 257                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
 258                 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
 259                 u"uploader": u"Icona Pop",
 260                 u"uploader_id": u"IconaPop"
 261             }
 262         },
 263         {
 264             u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
 265             u"file":  u"07FYdnEawAQ.mp4",
 266             u"note": u"Test VEVO video with age protection (#956)",
 267             u"info_dict": {
 268                 u"upload_date": u"20130703",
 269                 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
 270                 u"description": u"md5:64249768eec3bc4276236606ea996373",
 271                 u"uploader": u"justintimberlakeVEVO",
 272                 u"uploader_id": u"justintimberlakeVEVO"
 273             }
 274         },
 275         {
 276             u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
 277             u"file":  u"yZIXLfi8CZQ.mp4",
 278             u"note": u"Embed-only video (#1746)",
 279             u"info_dict": {
 280                 u"upload_date": u"20120608",
 281                 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
 282                 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
 283                 u"uploader": u"SET India",
 284                 u"uploader_id": u"setindia"
 285             }
 286         },
 287         {
 288             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 289             u"file": u"a9LDPn-MO4I.m4a",
 290             u"note": u"256k DASH audio (format 141) via DASH manifest",
 291             u"info_dict": {
 292                 u"upload_date": "20121002",
 293                 u"uploader_id": "8KVIDEO",
 294                 u"description": "No description available.",
 295                 u"uploader": "8KVIDEO",
 296                 u"title": "UHDTV TEST 8K VIDEO.mp4"
 297             },
 298             u"params": {
 299                 u"youtube_include_dash_manifest": True,
 300                 u"format": "141",
 301             },
 302         },
 303         # DASH manifest with encrypted signature
 304         {
 305             u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 306             u'info_dict': {
 307                 u'id': u'IB3lcPjvWLA',
 308                 u'ext': u'm4a',
 309                 u'title': u'Afrojack - The Spark ft. Spree Wilson',
 310                 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
 311                 u'uploader': u'AfrojackVEVO',
 312                 u'uploader_id': u'AfrojackVEVO',
 313                 u'upload_date': u'20131011',
 314             },
 315             u"params": {
 316                 u'youtube_include_dash_manifest': True,
 317                 u'format': '141',
 318             },
 319         },
 320     ]
 321
 322
 323     @classmethod
 324     def suitable(cls, url):
 325         """Receives a URL and returns True if suitable for this IE."""
 326         if YoutubePlaylistIE.suitable(url): return False
 327         return re.match(cls._VALID_URL, url) is not None
 328
 329     def __init__(self, *args, **kwargs):
 330         super(YoutubeIE, self).__init__(*args, **kwargs)
 331         self._player_cache = {}
 332
 333     def report_video_info_webpage_download(self, video_id):
 334         """Report attempt to download video info webpage."""
 335         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 336
 337     def report_information_extraction(self, video_id):
 338         """Report attempt to extract video information."""
 339         self.to_screen(u'%s: Extracting video information' % video_id)
 340
 341     def report_unavailable_format(self, video_id, format):
 342         """Report extracted video URL."""
 343         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 344
 345     def report_rtmp_download(self):
 346         """Indicate the download will use the RTMP protocol."""
 347         self.to_screen(u'RTMP download detected')
 348
 349     def _extract_signature_function(self, video_id, player_url, slen):
 350         id_m = re.match(
 351             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$',
 352             player_url)
 353         player_type = id_m.group('ext')
 354         player_id = id_m.group('id')
 355
 356         # Read from filesystem cache
 357         func_id = '%s_%s_%d' % (player_type, player_id, slen)
 358         assert os.path.basename(func_id) == func_id
 359         cache_dir = get_cachedir(self._downloader.params)
 360
 361         cache_enabled = cache_dir is not None
 362         if cache_enabled:
 363             cache_fn = os.path.join(os.path.expanduser(cache_dir),
 364                                     u'youtube-sigfuncs',
 365                                     func_id + '.json')
 366             try:
 367                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
 368                     cache_spec = json.load(cachef)
 369                 return lambda s: u''.join(s[i] for i in cache_spec)
 370             except IOError:
 371                 pass  # No cache available
 372
 373         if player_type == 'js':
 374             code = self._download_webpage(
 375                 player_url, video_id,
 376                 note=u'Downloading %s player %s' % (player_type, player_id),
 377                 errnote=u'Download of %s failed' % player_url)
 378             res = self._parse_sig_js(code)
 379         elif player_type == 'swf':
 380             urlh = self._request_webpage(
 381                 player_url, video_id,
 382                 note=u'Downloading %s player %s' % (player_type, player_id),
 383                 errnote=u'Download of %s failed' % player_url)
 384             code = urlh.read()
 385             res = self._parse_sig_swf(code)
 386         else:
 387             assert False, 'Invalid player type %r' % player_type
 388
 389         if cache_enabled:
 390             try:
 391                 test_string = u''.join(map(compat_chr, range(slen)))
 392                 cache_res = res(test_string)
 393                 cache_spec = [ord(c) for c in cache_res]
 394                 try:
 395                     os.makedirs(os.path.dirname(cache_fn))
 396                 except OSError as ose:
 397                     if ose.errno != errno.EEXIST:
 398                         raise
 399                 write_json_file(cache_spec, cache_fn)
 400             except Exception:
 401                 tb = traceback.format_exc()
 402                 self._downloader.report_warning(
 403                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
 404
 405         return res
 406
 407     def _print_sig_code(self, func, slen):
 408         def gen_sig_code(idxs):
 409             def _genslice(start, end, step):
 410                 starts = u'' if start == 0 else str(start)
 411                 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
 412                 steps = u'' if step == 1 else (u':%d' % step)
 413                 return u's[%s%s%s]' % (starts, ends, steps)
 414
 415             step = None
 416             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 417                                     # set as soon as step is set
 418             for i, prev in zip(idxs[1:], idxs[:-1]):
 419                 if step is not None:
 420                     if i - prev == step:
 421                         continue
 422                     yield _genslice(start, prev, step)
 423                     step = None
 424                     continue
 425                 if i - prev in [-1, 1]:
 426                     step = i - prev
 427                     start = prev
 428                     continue
 429                 else:
 430                     yield u's[%d]' % prev
 431             if step is None:
 432                 yield u's[%d]' % i
 433             else:
 434                 yield _genslice(start, i, step)
 435
 436         test_string = u''.join(map(compat_chr, range(slen)))
 437         cache_res = func(test_string)
 438         cache_spec = [ord(c) for c in cache_res]
 439         expr_code = u' + '.join(gen_sig_code(cache_spec))
 440         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
 441         self.to_screen(u'Extracted signature function:\n' + code)
 442
 443     def _parse_sig_js(self, jscode):
 444         funcname = self._search_regex(
 445             r'signature=([$a-zA-Z]+)', jscode,
 446              u'Initial JS player signature function name')
 447
 448         jsi = JSInterpreter(jscode)
 449         initial_function = jsi.extract_function(funcname)
 450         return lambda s: initial_function([s])
 451
 452     def _parse_sig_swf(self, file_contents):
 453         if file_contents[1:3] != b'WS':
 454             raise ExtractorError(
 455                 u'Not an SWF file; header is %r' % file_contents[:3])
 456         if file_contents[:1] == b'C':
 457             content = zlib.decompress(file_contents[8:])
 458         else:
 459             raise NotImplementedError(u'Unsupported compression format %r' %
 460                                       file_contents[:1])
 461
 462         def extract_tags(content):
 463             pos = 0
 464             while pos < len(content):
 465                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
 466                 pos += 2
 467                 tag_code = header16 >> 6
 468                 tag_len = header16 & 0x3f
 469                 if tag_len == 0x3f:
 470                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
 471                     pos += 4
 472                 assert pos+tag_len <= len(content)
 473                 yield (tag_code, content[pos:pos+tag_len])
 474                 pos += tag_len
 475
 476         code_tag = next(tag
 477                         for tag_code, tag in extract_tags(content)
 478                         if tag_code == 82)
 479         p = code_tag.index(b'\0', 4) + 1
 480         code_reader = io.BytesIO(code_tag[p:])
 481
 482         # Parse ABC (AVM2 ByteCode)
 483         def read_int(reader=None):
 484             if reader is None:
 485                 reader = code_reader
 486             res = 0
 487             shift = 0
 488             for _ in range(5):
 489                 buf = reader.read(1)
 490                 assert len(buf) == 1
 491                 b = struct.unpack('<B', buf)[0]
 492                 res = res | ((b & 0x7f) << shift)
 493                 if b & 0x80 == 0:
 494                     break
 495                 shift += 7
 496             return res
 497
 498         def u30(reader=None):
 499             res = read_int(reader)
 500             assert res & 0xf0000000 == 0
 501             return res
 502         u32 = read_int
 503
 504         def s32(reader=None):
 505             v = read_int(reader)
 506             if v & 0x80000000 != 0:
 507                 v = - ((v ^ 0xffffffff) + 1)
 508             return v
 509
 510         def s24(reader):
 511             bs = reader.read(3)
 512             assert len(bs) == 3
 513             first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00'
 514             return struct.unpack('!i', first_byte + bs)
 515
 516         def read_string(reader=None):
 517             if reader is None:
 518                 reader = code_reader
 519             slen = u30(reader)
 520             resb = reader.read(slen)
 521             assert len(resb) == slen
 522             return resb.decode('utf-8')
 523
 524         def read_bytes(count, reader=None):
 525             if reader is None:
 526                 reader = code_reader
 527             resb = reader.read(count)
 528             assert len(resb) == count
 529             return resb
 530
 531         def read_byte(reader=None):
 532             resb = read_bytes(1, reader=reader)
 533             res = struct.unpack('<B', resb)[0]
 534             return res
 535
 536         # minor_version + major_version
 537         read_bytes(2 + 2)
 538
 539         # Constant pool
 540         int_count = u30()
 541         for _c in range(1, int_count):
 542             s32()
 543         uint_count = u30()
 544         for _c in range(1, uint_count):
 545             u32()
 546         double_count = u30()
 547         read_bytes((double_count-1) * 8)
 548         string_count = u30()
 549         constant_strings = [u'']
 550         for _c in range(1, string_count):
 551             s = read_string()
 552             constant_strings.append(s)
 553         namespace_count = u30()
 554         for _c in range(1, namespace_count):
 555             read_bytes(1)  # kind
 556             u30()  # name
 557         ns_set_count = u30()
 558         for _c in range(1, ns_set_count):
 559             count = u30()
 560             for _c2 in range(count):
 561                 u30()
 562         multiname_count = u30()
 563         MULTINAME_SIZES = {
 564             0x07: 2,  # QName
 565             0x0d: 2,  # QNameA
 566             0x0f: 1,  # RTQName
 567             0x10: 1,  # RTQNameA
 568             0x11: 0,  # RTQNameL
 569             0x12: 0,  # RTQNameLA
 570             0x09: 2,  # Multiname
 571             0x0e: 2,  # MultinameA
 572             0x1b: 1,  # MultinameL
 573             0x1c: 1,  # MultinameLA
 574         }
 575         multinames = [u'']
 576         for _c in range(1, multiname_count):
 577             kind = u30()
 578             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
 579             if kind == 0x07:
 580                 u30()  # namespace_idx
 581                 name_idx = u30()
 582                 multinames.append(constant_strings[name_idx])
 583             else:
 584                 multinames.append('[MULTINAME kind: %d]' % kind)
 585                 for _c2 in range(MULTINAME_SIZES[kind]):
 586                     u30()
 587
 588         # Methods
 589         method_count = u30()
 590         MethodInfo = collections.namedtuple(
 591             'MethodInfo',
 592             ['NEED_ARGUMENTS', 'NEED_REST'])
 593         method_infos = []
 594         for method_id in range(method_count):
 595             param_count = u30()
 596             u30()  # return type
 597             for _ in range(param_count):
 598                 u30()  # param type
 599             u30()  # name index (always 0 for youtube)
 600             flags = read_byte()
 601             if flags & 0x08 != 0:
 602                 # Options present
 603                 option_count = u30()
 604                 for c in range(option_count):
 605                     u30()  # val
 606                     read_bytes(1)  # kind
 607             if flags & 0x80 != 0:
 608                 # Param names present
 609                 for _ in range(param_count):
 610                     u30()  # param name
 611             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
 612             method_infos.append(mi)
 613
 614         # Metadata
 615         metadata_count = u30()
 616         for _c in range(metadata_count):
 617             u30()  # name
 618             item_count = u30()
 619             for _c2 in range(item_count):
 620                 u30()  # key
 621                 u30()  # value
 622
 623         def parse_traits_info():
 624             trait_name_idx = u30()
 625             kind_full = read_byte()
 626             kind = kind_full & 0x0f
 627             attrs = kind_full >> 4
 628             methods = {}
 629             if kind in [0x00, 0x06]:  # Slot or Const
 630                 u30()  # Slot id
 631                 u30()  # type_name_idx
 632                 vindex = u30()
 633                 if vindex != 0:
 634                     read_byte()  # vkind
 635             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
 636                 u30()  # disp_id
 637                 method_idx = u30()
 638                 methods[multinames[trait_name_idx]] = method_idx
 639             elif kind == 0x04:  # Class
 640                 u30()  # slot_id
 641                 u30()  # classi
 642             elif kind == 0x05:  # Function
 643                 u30()  # slot_id
 644                 function_idx = u30()
 645                 methods[function_idx] = multinames[trait_name_idx]
 646             else:
 647                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
 648
 649             if attrs & 0x4 != 0:  # Metadata present
 650                 metadata_count = u30()
 651                 for _c3 in range(metadata_count):
 652                     u30()  # metadata index
 653
 654             return methods
 655
 656         class AVMClass(object):
 657             def __init__(self, name_idx):
 658                 self.name_idx = name_idx
 659                 self.method_names = {}
 660                 self.method_idxs = {}
 661                 self.methods = {}
 662                 self.method_pyfunctions = {}
 663                 self.variables = {}
 664
 665             @property
 666             def name(self):
 667                 return multinames[self.name_idx]
 668
 669         # Classes
 670         class_count = u30()
 671         classes = []
 672         for class_id in range(class_count):
 673             name_idx = u30()
 674             classes.append(AVMClass(name_idx))
 675             u30()  # super_name idx
 676             flags = read_byte()
 677             if flags & 0x08 != 0:  # Protected namespace is present
 678                 u30()  # protected_ns_idx
 679             intrf_count = u30()
 680             for _c2 in range(intrf_count):
 681                 u30()
 682             u30()  # iinit
 683             trait_count = u30()
 684             for _c2 in range(trait_count):
 685                 parse_traits_info()
 686         assert len(classes) == class_count
 687
 688         TARGET_CLASSNAME = u'SignatureDecipher'
 689         searched_class = next(
 690             c for c in classes if c.name == TARGET_CLASSNAME)
 691         if searched_class is None:
 692             raise ExtractorError(u'Target class %r not found' %
 693                                  TARGET_CLASSNAME)
 694
 695         for avm_class in classes:
 696             u30()  # cinit
 697             trait_count = u30()
 698             for _c2 in range(trait_count):
 699                 trait_methods = parse_traits_info()
 700                 avm_class.method_names.update(trait_methods.items())
 701                 avm_class.method_idxs.update(dict(
 702                     (idx, name)
 703                     for name, idx in trait_methods.items()))
 704
 705         # Scripts
 706         script_count = u30()
 707         for _c in range(script_count):
 708             u30()  # init
 709             trait_count = u30()
 710             for _c2 in range(trait_count):
 711                 parse_traits_info()
 712
 713         # Method bodies
 714         method_body_count = u30()
 715         Method = collections.namedtuple('Method', ['code', 'local_count'])
 716         for _c in range(method_body_count):
 717             method_idx = u30()
 718             u30()  # max_stack
 719             local_count = u30()
 720             u30()  # init_scope_depth
 721             u30()  # max_scope_depth
 722             code_length = u30()
 723             code = read_bytes(code_length)
 724             for avm_class in classes:
 725                 if method_idx in avm_class.method_idxs:
 726                     m = Method(code, local_count)
 727                     avm_class.methods[avm_class.method_idxs[method_idx]] = m
 728             exception_count = u30()
 729             for _c2 in range(exception_count):
 730                 u30()  # from
 731                 u30()  # to
 732                 u30()  # target
 733                 u30()  # exc_type
 734                 u30()  # var_name
 735             trait_count = u30()
 736             for _c2 in range(trait_count):
 737                 parse_traits_info()
 738
 739         assert p + code_reader.tell() == len(code_tag)
 740
        def extract_function(avm_class, func_name):
            """Return a Python callable that interprets method func_name of avm_class.

            A previously built callable is returned straight from
            avm_class.method_pyfunctions; otherwise a new interpreter closure is
            created over the method's bytecode, stored there and returned.

            The returned callable takes ONE argument: a sequence holding the
            method's arguments.  It returns the value popped by the
            ``returnvalue`` opcode.

            Raises ExtractorError if func_name has no entry in avm_class.methods.
            The interpreter itself raises NotImplementedError for opcodes or
            properties it does not handle.
            """
            if func_name in avm_class.method_pyfunctions:
                return avm_class.method_pyfunctions[func_name]
            if func_name not in avm_class.methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = avm_class.methods[func_name]

            def resfunc(args):
                # Register 0 is a placeholder for ``this``, followed by the call
                # arguments and then m.local_count empty slots.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                # u30(coder) / s24(coder) are defined outside this block;
                # presumably they decode one operand from the stream and advance it.
                while True:
                    # Each instruction starts with a one-byte opcode.
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 17:  # iftrue
                        offset = s24(coder)
                        value = stack.pop()
                        if value:
                            # Jump relative to the current stream position,
                            # i.e. after the offset operand has been read.
                            coder.seek(coder.tell() + offset)
                    elif opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        # Arguments come off the stack last-first; the receiver
                        # sits below them.  NOTE: this rebinds the parameter
                        # name ``args``, which is harmless because ``registers``
                        # was already built from it.
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                # Python's str.split rejects an empty separator,
                                # so split into single characters explicitly.
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            # Only the one-argument form (start index) is handled.
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in avm_class.method_pyfunctions:
                            # Call another method of the same class; only
                            # methods that have already been extracted are
                            # found here.
                            stack.append(avm_class.method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        # The only exit from the interpreter loop.
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            # In-place; nothing is pushed for a void call.
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 86:  # newarray
                        arg_count = u30(coder)
                        arr = []
                        for i in range(arg_count):
                            arr.append(stack.pop())
                        # Popping reversed the elements; restore push order.
                        arr = arr[::-1]
                        stack.append(arr)
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        # Resolves (and caches) the named method of the same
                        # class, then pushes the callable as the "object".
                        res = extract_function(avm_class, mname)
                        stack.append(res)
                    elif opcode == 94:  # findproperty
                        index = u30(coder)
                        mname = multinames[index]
                        # Pushes None when the variable is unknown.
                        res = avm_class.variables.get(mname)
                        stack.append(res)
                    elif opcode == 96:  # getlex
                        index = u30(coder)
                        mname = multinames[index]
                        # Handled exactly like findproperty above.
                        res = avm_class.variables.get(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        # The multiname operand is read only to advance the
                        # stream; the target is always treated as list[int].
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            # Any other name is treated as an integer index
                            # into a list: the index is on top, the list below.
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        # Operand is skipped; no conversion is performed.
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        # No conversion either; just check the top of stack
                        # already is a string (or None).
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 175:  # greaterequals
                        value2 = stack.pop()
                        value1 = stack.pop()
                        result = value1 >= value2
                        stack.append(result)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            # Memoise so later lookups (including callproperty above) reuse it.
            avm_class.method_pyfunctions[func_name] = resfunc
            return resfunc
 902
 903         initial_function = extract_function(searched_class, u'decipher')
 904         return lambda s: initial_function([s])
 905
 906     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 907         """Turn the encrypted s field into a working signature"""
 908
 909         if player_url is None:
 910             raise ExtractorError(u'Cannot decrypt signature without player_url')
 911
 912         if player_url.startswith(u'//'):
 913             player_url = u'https:' + player_url
 914         try:
 915             player_id = (player_url, len(s))
 916             if player_id not in self._player_cache:
 917                 func = self._extract_signature_function(
 918                     video_id, player_url, len(s)
 919                 )
 920                 self._player_cache[player_id] = func
 921             func = self._player_cache[player_id]
 922             if self._downloader.params.get('youtube_print_sig_code'):
 923                 self._print_sig_code(func, len(s))
 924             return func(s)
 925         except Exception as e:
 926             tb = traceback.format_exc()
 927             raise ExtractorError(
 928                 u'Automatic signature extraction failed: ' + tb, cause=e)
 929
 930     def _get_available_subtitles(self, video_id, webpage):
 931         try:
 932             sub_list = self._download_webpage(
 933                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 934                 video_id, note=False)
 935         except ExtractorError as err:
 936             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 937             return {}
 938         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 939
 940         sub_lang_list = {}
 941         for l in lang_list:
 942             lang = l[1]
 943             params = compat_urllib_parse.urlencode({
 944                 'lang': lang,
 945                 'v': video_id,
 946                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 947                 'name': unescapeHTML(l[0]).encode('utf-8'),
 948             })
 949             url = u'https://www.youtube.com/api/timedtext?' + params
 950             sub_lang_list[lang] = url
 951         if not sub_lang_list:
 952             self._downloader.report_warning(u'video doesn\'t have subtitles')
 953             return {}
 954         return sub_lang_list
 955
 956     def _get_available_automatic_caption(self, video_id, webpage):
 957         """We need the webpage for getting the captions url, pass it as an
 958            argument to speed up the process."""
 959         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 960         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 961         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 962         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
 963         if mobj is None:
 964             self._downloader.report_warning(err_msg)
 965             return {}
 966         player_config = json.loads(mobj.group(1))
 967         try:
 968             args = player_config[u'args']
 969             caption_url = args[u'ttsurl']
 970             timestamp = args[u'timestamp']
 971             # We get the available subtitles
 972             list_params = compat_urllib_parse.urlencode({
 973                 'type': 'list',
 974                 'tlangs': 1,
 975                 'asrs': 1,
 976             })
 977             list_url = caption_url + '&' + list_params
 978             caption_list = self._download_xml(list_url, video_id)
 979             original_lang_node = caption_list.find('track')
 980             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 981                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
 982                 return {}
 983             original_lang = original_lang_node.attrib['lang_code']
 984
 985             sub_lang_list = {}
 986             for lang_node in caption_list.findall('target'):
 987                 sub_lang = lang_node.attrib['lang_code']
 988                 params = compat_urllib_parse.urlencode({
 989                     'lang': original_lang,
 990                     'tlang': sub_lang,
 991                     'fmt': sub_format,
 992                     'ts': timestamp,
 993                     'kind': 'asr',
 994                 })
 995                 sub_lang_list[sub_lang] = caption_url + '&' + params
 996             return sub_lang_list
 997         # An extractor error can be raise by the download process if there are
 998         # no automatic captions but there are subtitles
 999         except (KeyError, ExtractorError):
1000             self._downloader.report_warning(err_msg)
1001             return {}
1002
1003     @classmethod
1004     def extract_id(cls, url):
1005         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1006         if mobj is None:
1007             raise ExtractorError(u'Invalid URL: %s' % url)
1008         video_id = mobj.group(2)
1009         return video_id
1010
1011     def _extract_from_m3u8(self, manifest_url, video_id):
1012         url_map = {}
1013         def _get_urls(_manifest):
1014             lines = _manifest.split('\n')
1015             urls = filter(lambda l: l and not l.startswith('#'),
1016                             lines)
1017             return urls
1018         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1019         formats_urls = _get_urls(manifest)
1020         for format_url in formats_urls:
1021             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1022             url_map[itag] = format_url
1023         return url_map
1024
1025     def _extract_annotations(self, video_id):
1026         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1027         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1028
1029     def _real_extract(self, url):
1030         proto = (
1031             u'http' if self._downloader.params.get('prefer_insecure', False)
1032             else u'https')
1033
1034         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1035         mobj = re.search(self._NEXT_URL_RE, url)
1036         if mobj:
1037             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1038         video_id = self.extract_id(url)
1039
1040         # Get video webpage
1041         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1042         video_webpage = self._download_webpage(url, video_id)
1043
1044         # Attempt to extract SWF player URL
1045         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1046         if mobj is not None:
1047             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1048         else:
1049             player_url = None
1050
1051         # Get video info
1052         self.report_video_info_webpage_download(video_id)
1053         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1054             self.report_age_confirmation()
1055             age_gate = True
1056             # We simulate the access to the video from www.youtube.com/v/{video_id}
1057             # this can be viewed without login into Youtube
1058             data = compat_urllib_parse.urlencode({'video_id': video_id,
1059                                                   'el': 'player_embedded',
1060                                                   'gl': 'US',
1061                                                   'hl': 'en',
1062                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1063                                                   'asv': 3,
1064                                                   'sts':'1588',
1065                                                   })
1066             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1067             video_info_webpage = self._download_webpage(video_info_url, video_id,
1068                                     note=False,
1069                                     errnote='unable to download video info webpage')
1070             video_info = compat_parse_qs(video_info_webpage)
1071         else:
1072             age_gate = False
1073             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1074                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1075                         % (video_id, el_type))
1076                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1077                                         note=False,
1078                                         errnote='unable to download video info webpage')
1079                 video_info = compat_parse_qs(video_info_webpage)
1080                 if 'token' in video_info:
1081                     break
1082         if 'token' not in video_info:
1083             if 'reason' in video_info:
1084                 raise ExtractorError(
1085                     u'YouTube said: %s' % video_info['reason'][0],
1086                     expected=True, video_id=video_id)
1087             else:
1088                 raise ExtractorError(
1089                     u'"token" parameter not in video info for unknown reason',
1090                     video_id=video_id)
1091
1092         if 'view_count' in video_info:
1093             view_count = int(video_info['view_count'][0])
1094         else:
1095             view_count = None
1096
1097         # Check for "rental" videos
1098         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1099             raise ExtractorError(u'"rental" videos not supported')
1100
1101         # Start extracting information
1102         self.report_information_extraction(video_id)
1103
1104         # uploader
1105         if 'author' not in video_info:
1106             raise ExtractorError(u'Unable to extract uploader name')
1107         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1108
1109         # uploader_id
1110         video_uploader_id = None
1111         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1112         if mobj is not None:
1113             video_uploader_id = mobj.group(1)
1114         else:
1115             self._downloader.report_warning(u'unable to extract uploader nickname')
1116
1117         # title
1118         if 'title' in video_info:
1119             video_title = video_info['title'][0]
1120         else:
1121             self._downloader.report_warning(u'Unable to extract video title')
1122             video_title = u'_'
1123
1124         # thumbnail image
1125         # We try first to get a high quality image:
1126         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1127                             video_webpage, re.DOTALL)
1128         if m_thumb is not None:
1129             video_thumbnail = m_thumb.group(1)
1130         elif 'thumbnail_url' not in video_info:
1131             self._downloader.report_warning(u'unable to extract video thumbnail')
1132             video_thumbnail = None
1133         else:   # don't panic if we can't find it
1134             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1135
1136         # upload date
1137         upload_date = None
1138         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
1139         if mobj is None:
1140             mobj = re.search(
1141                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
1142                 video_webpage)
1143         if mobj is not None:
1144             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1145             upload_date = unified_strdate(upload_date)
1146
1147         m_cat_container = get_element_by_id("eow-category", video_webpage)
1148         if m_cat_container:
1149             category = self._html_search_regex(
1150                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1151                 default=None)
1152             video_categories = None if category is None else [category]
1153         else:
1154             video_categories = None
1155
1156         # description
1157         video_description = get_element_by_id("eow-description", video_webpage)
1158         if video_description:
1159             video_description = re.sub(r'''(?x)
1160                 <a\s+
1161                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1162                     title="([^"]+)"\s+
1163                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1164                     class="yt-uix-redirect-link"\s*>
1165                 [^<]+
1166                 </a>
1167             ''', r'\1', video_description)
1168             video_description = clean_html(video_description)
1169         else:
1170             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1171             if fd_mobj:
1172                 video_description = unescapeHTML(fd_mobj.group(1))
1173             else:
1174                 video_description = u''
1175
1176         def _extract_count(klass):
1177             count = self._search_regex(
1178                 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1179                 video_webpage, klass, default=None)
1180             if count is not None:
1181                 return int(count.replace(',', ''))
1182             return None
1183         like_count = _extract_count(u'likes-count')
1184         dislike_count = _extract_count(u'dislikes-count')
1185
1186         # subtitles
1187         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1188
1189         if self._downloader.params.get('listsubtitles', False):
1190             self._list_available_subtitles(video_id, video_webpage)
1191             return
1192
1193         if 'length_seconds' not in video_info:
1194             self._downloader.report_warning(u'unable to extract video duration')
1195             video_duration = None
1196         else:
1197             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1198
1199         # annotations
1200         video_annotations = None
1201         if self._downloader.params.get('writeannotations', False):
1202                 video_annotations = self._extract_annotations(video_id)
1203
1204         # Decide which formats to download
1205         try:
1206             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1207             if not mobj:
1208                 raise ValueError('Could not find vevo ID')
1209             json_code = uppercase_escape(mobj.group(1))
1210             ytplayer_config = json.loads(json_code)
1211             args = ytplayer_config['args']
1212             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1213             # this signatures are encrypted
1214             if 'url_encoded_fmt_stream_map' not in args:
1215                 raise ValueError(u'No stream_map present')  # caught below
1216             re_signature = re.compile(r'[&,]s=')
1217             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1218             if m_s is not None:
1219                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1220                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1221             m_s = re_signature.search(args.get('adaptive_fmts', u''))
1222             if m_s is not None:
1223                 if 'adaptive_fmts' in video_info:
1224                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1225                 else:
1226                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1227         except ValueError:
1228             pass
1229
1230         def _map_to_format_list(urlmap):
1231             formats = []
1232             for itag, video_real_url in urlmap.items():
1233                 dct = {
1234                     'format_id': itag,
1235                     'url': video_real_url,
1236                     'player_url': player_url,
1237                 }
1238                 if itag in self._formats:
1239                     dct.update(self._formats[itag])
1240                 formats.append(dct)
1241             return formats
1242
1243         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1244             self.report_rtmp_download()
1245             formats = [{
1246                 'format_id': '_rtmp',
1247                 'protocol': 'rtmp',
1248                 'url': video_info['conn'][0],
1249                 'player_url': player_url,
1250             }]
1251         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1252             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1253             if 'rtmpe%3Dyes' in encoded_url_map:
1254                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1255             url_map = {}
1256             for url_data_str in encoded_url_map.split(','):
1257                 url_data = compat_parse_qs(url_data_str)
1258                 if 'itag' in url_data and 'url' in url_data:
1259                     url = url_data['url'][0]
1260                     if 'sig' in url_data:
1261                         url += '&signature=' + url_data['sig'][0]
1262                     elif 's' in url_data:
1263                         encrypted_sig = url_data['s'][0]
1264
1265                         if not age_gate:
1266                             jsplayer_url_json = self._search_regex(
1267                                 r'"assets":.+?"js":\s*("[^"]+")',
1268                                 video_webpage, u'JS player URL')
1269                             player_url = json.loads(jsplayer_url_json)
1270                         if player_url is None:
1271                             player_url_json = self._search_regex(
1272                                 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1273                                 video_webpage, u'age gate player URL')
1274                             player_url = json.loads(player_url_json)
1275
1276                         if self._downloader.params.get('verbose'):
1277                             if player_url is None:
1278                                 player_version = 'unknown'
1279                                 player_desc = 'unknown'
1280                             else:
1281                                 if player_url.endswith('swf'):
1282                                     player_version = self._search_regex(
1283                                         r'-(.+)\.swf$', player_url,
1284                                         u'flash player', fatal=False)
1285                                     player_desc = 'flash player %s' % player_version
1286                                 else:
1287                                     player_version = self._search_regex(
1288                                         r'html5player-(.+?)\.js', video_webpage,
1289                                         'html5 player', fatal=False)
1290                                     player_desc = u'html5 player %s' % player_version
1291
1292                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1293                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1294                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1295
1296                         signature = self._decrypt_signature(
1297                             encrypted_sig, video_id, player_url, age_gate)
1298                         url += '&signature=' + signature
1299                     if 'ratebypass' not in url:
1300                         url += '&ratebypass=yes'
1301                     url_map[url_data['itag'][0]] = url
1302             formats = _map_to_format_list(url_map)
1303         elif video_info.get('hlsvp'):
1304             manifest_url = video_info['hlsvp'][0]
1305             url_map = self._extract_from_m3u8(manifest_url, video_id)
1306             formats = _map_to_format_list(url_map)
1307         else:
1308             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1309
1310         # Look for the DASH manifest
1311         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
1312             try:
1313                 # The DASH manifest used needs to be the one from the original video_webpage.
1314                 # The one found in get_video_info seems to be using different signatures.
1315                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
1316                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
1317                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
1318                 if age_gate:
1319                     dash_manifest_url = video_info.get('dashmpd')[0]
1320                 else:
1321                     dash_manifest_url = ytplayer_config['args']['dashmpd']
1322                 def decrypt_sig(mobj):
1323                     s = mobj.group(1)
1324                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1325                     return '/signature/%s' % dec_s
1326                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
1327                 dash_doc = self._download_xml(
1328                     dash_manifest_url, video_id,
1329                     note=u'Downloading DASH manifest',
1330                     errnote=u'Could not download DASH manifest')
1331                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1332                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1333                     if url_el is None:
1334                         continue
1335                     format_id = r.attrib['id']
1336                     video_url = url_el.text
1337                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1338                     f = {
1339                         'format_id': format_id,
1340                         'url': video_url,
1341                         'width': int_or_none(r.attrib.get('width')),
1342                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1343                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1344                         'filesize': filesize,
1345                     }
1346                     try:
1347                         existing_format = next(
1348                             fo for fo in formats
1349                             if fo['format_id'] == format_id)
1350                     except StopIteration:
1351                         f.update(self._formats.get(format_id, {}))
1352                         formats.append(f)
1353                     else:
1354                         existing_format.update(f)
1355
1356             except (ExtractorError, KeyError) as e:
1357                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1358
1359         self._sort_formats(formats)
1360
1361         return {
1362             'id':           video_id,
1363             'uploader':     video_uploader,
1364             'uploader_id':  video_uploader_id,
1365             'upload_date':  upload_date,
1366             'title':        video_title,
1367             'thumbnail':    video_thumbnail,
1368             'description':  video_description,
1369             'categories':   video_categories,
1370             'subtitles':    video_subtitles,
1371             'duration':     video_duration,
1372             'age_limit':    18 if age_gate else 0,
1373             'annotations':  video_annotations,
1374             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1375             'view_count':   view_count,
1376             'like_count': like_count,
1377             'dislike_count': dislike_count,
1378             'formats':      formats,
1379         }
1380
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including the 'RD' mixes."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for video_id in ids:
            results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix.

        A mix is generated from a single video: the playlist id is 'RD'
        followed by the video id, so the last 11 characters of the id are
        used to build the watch URL.
        """
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')

        # Try the known title containers in order; keep the first non-empty
        # one (or whatever the last lookup returned when none matched).
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        found_ids = re.findall(video_re, webpage, flags=re.DOTALL)
        entries = self._ids_to_results(orderedSet(found_ids))

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single-video result) for ``url``."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # A URL with a 'v' parameter names a video as well as a playlist;
        # the 'noplaylist' option decides which of the two is downloaded.
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction routine
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Bail out early when the page reports a missing or private playlist
        unavailable_re = r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>'
        if re.search(unavailable_re, page):
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Walk the playlist: the first page is plain HTML, the following
        # ones are JSON documents reached through the "load more" link.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # Duplicates are removed and the link with index 0 is dropped
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if load_more is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1491
1492
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for top lists addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on re-downloads of the list page when it comes back
    # without any video; prevents an endless request loop.
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Locate the list on the channel page and return its videos as a playlist.

        Raises ExtractorError if the list page still contains no videos
        after _MAX_RETRIES additional attempts.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The link to the list is recognised by its url-encoded title
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for i in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any videos in the top list "%s" after %d retries'
                % (title, self._MAX_RETRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1523
1524
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, without duplicates, in page order."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Collect every video of the channel and return them as a playlist."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Fetch the channel's video listing
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated = bool(re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page))

        if autogenerated:
            # Everything is on this one page; the ajax pages
            # can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # No "load more" widget means this was the last page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1579
1580
class YoutubeUserIE(InfoExtractor):
    """Extractor that lists a user's uploads through the GData feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '...IE' class of this module accepts ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError if the URL does not match or if an API
        response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The API start index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Index of the last entry requested by this page (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries ends the generator without yielding
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1641
1642
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (jsonc format)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist with at most ``n`` results for ``query``.

        Raises ExtractorError when an API page carries no 'items'.
        """
        PAGE_SIZE = 50
        encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        collected_ids = []
        limit = n
        for pagenum in itertools.count():
            offset = PAGE_SIZE * pagenum
            if offset >= limit:
                break

            data_json = self._download_webpage(
                self._API_URL % (encoded_query, offset + 1),
                video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(
                video['id'] for video in api_response['items'])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])

        # The last page may have pushed us past the requested amount
        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1684
1685
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor whose API URL additionally passes orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1691
1692
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape the result page and return its items as a playlist titled with the query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # All results live inside the "item-section" ordered list
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # One <h3 class="yt-lockup-title"> heading per result
        for heading in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional (fatal=False), the link is not
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1727
1728
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages, which link one playlist per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return one 'YoutubePlaylist' url_result per playlist link on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show has its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
1742
1743
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    A subclass has to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested
    # instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining '%s' takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all of its videos as a playlist."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_num)
            # The markup is read from 'feed_html', falling back to 'content_html'
            feed_html = info.get('feed_html') or info.get('content_html')
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1788
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1794
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1800
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1807
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' is not a valid
    # escape sequence in a regular string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1814
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and hand it to 'YoutubePlaylist'."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first 'list=' parameter found on the page is taken as the playlist id
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1825
1826
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and reports how to fix them."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a hint about quoting the URL."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)