[youtube] Move JavaScript interpreter into its own module
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import string
11 import struct
12 import traceback
13 import zlib
14
15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
17 from ..jsinterp import JSInterpreter
18 from ..utils import (
19     compat_chr,
20     compat_parse_qs,
21     compat_urllib_parse,
22     compat_urllib_request,
23     compat_urlparse,
24     compat_str,
25
26     clean_html,
27     get_cachedir,
28     get_element_by_id,
29     get_element_by_attribute,
30     ExtractorError,
31     int_or_none,
32     PagedList,
33     unescapeHTML,
34     unified_strdate,
35     orderedSet,
36     write_json_file,
37     uppercase_escape,
38 )
39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube frontend to English.

        Returns True when the language page downloaded successfully,
        False otherwise (the request is non-fatal).
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log into the configured YouTube (Google) account.

        Returns True on success, False when no credentials are configured
        or the login attempt failed.  Raises ExtractorError when
        _LOGIN_REQUIRED is set but no credentials are available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False (not a bare None) so every failure path yields
            # the same boolean contract.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # The login form being present in the response means the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form.  Returns True on success."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age before extraction starts."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
132
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) regex matching all known YouTube video URL forms; the
    # 11-character video ID is captured, and the conditional (?(1).+)? only
    # allows trailing text when a URL prefix (group 1) was matched, so a
    # naked video ID is also accepted.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the next_url query parameter (used by age-gate redirect URLs).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
161     _formats = {
162         '5': {'ext': 'flv', 'width': 400, 'height': 240},
163         '6': {'ext': 'flv', 'width': 450, 'height': 270},
164         '13': {'ext': '3gp'},
165         '17': {'ext': '3gp', 'width': 176, 'height': 144},
166         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
167         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
168         '34': {'ext': 'flv', 'width': 640, 'height': 360},
169         '35': {'ext': 'flv', 'width': 854, 'height': 480},
170         '36': {'ext': '3gp', 'width': 320, 'height': 240},
171         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
172         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
173         '43': {'ext': 'webm', 'width': 640, 'height': 360},
174         '44': {'ext': 'webm', 'width': 854, 'height': 480},
175         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
176         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
177
178
179         # 3d videos
180         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
181         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
182         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
183         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
184         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
185         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
186         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
187
188         # Apple HTTP Live Streaming
189         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
190         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
191         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
192         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
193         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
194         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
195         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
196
197         # DASH mp4 video
198         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
205         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
206
207         # Dash mp4 audio
208         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
209         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
210         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
211
212         # Dash webm
213         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
214         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
216         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
218         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
219         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
220         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
221         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
222         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
223         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
224         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
225         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
226
227         # Dash webm audio
228         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
229         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
230
231         # RTMP (unnamed)
232         '_rtmp': {'protocol': 'rtmp'},
233     }
234
    IE_NAME = u'youtube'
    # Extraction test fixtures: each entry pairs a URL with the metadata the
    # extractor is expected to produce (presumably consumed by the project's
    # test harness; values written as u"md5:..." look like checksums of the
    # full field — TODO confirm against the harness).
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file":  u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
318
319
320     @classmethod
321     def suitable(cls, url):
322         """Receives a URL and returns True if suitable for this IE."""
323         if YoutubePlaylistIE.suitable(url): return False
324         return re.match(cls._VALID_URL, url) is not None
325
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of signature-deciphering functions, populated per player.
        self._player_cache = {}
329
330     def report_video_info_webpage_download(self, video_id):
331         """Report attempt to download video info webpage."""
332         self.to_screen(u'%s: Downloading video info webpage' % video_id)
333
334     def report_information_extraction(self, video_id):
335         """Report attempt to extract video information."""
336         self.to_screen(u'%s: Extracting video information' % video_id)
337
338     def report_unavailable_format(self, video_id, format):
339         """Report extracted video URL."""
340         self.to_screen(u'%s: Format %s not available' % (video_id, format))
341
342     def report_rtmp_download(self):
343         """Indicate the download will use the RTMP protocol."""
344         self.to_screen(u'RTMP download detected')
345
346     def _extract_signature_function(self, video_id, player_url, slen):
347         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
348                         player_url)
349         player_type = id_m.group('ext')
350         player_id = id_m.group('id')
351
352         # Read from filesystem cache
353         func_id = '%s_%s_%d' % (player_type, player_id, slen)
354         assert os.path.basename(func_id) == func_id
355         cache_dir = get_cachedir(self._downloader.params)
356
357         cache_enabled = cache_dir is not None
358         if cache_enabled:
359             cache_fn = os.path.join(os.path.expanduser(cache_dir),
360                                     u'youtube-sigfuncs',
361                                     func_id + '.json')
362             try:
363                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
364                     cache_spec = json.load(cachef)
365                 return lambda s: u''.join(s[i] for i in cache_spec)
366             except IOError:
367                 pass  # No cache available
368
369         if player_type == 'js':
370             code = self._download_webpage(
371                 player_url, video_id,
372                 note=u'Downloading %s player %s' % (player_type, player_id),
373                 errnote=u'Download of %s failed' % player_url)
374             res = self._parse_sig_js(code)
375         elif player_type == 'swf':
376             urlh = self._request_webpage(
377                 player_url, video_id,
378                 note=u'Downloading %s player %s' % (player_type, player_id),
379                 errnote=u'Download of %s failed' % player_url)
380             code = urlh.read()
381             res = self._parse_sig_swf(code)
382         else:
383             assert False, 'Invalid player type %r' % player_type
384
385         if cache_enabled:
386             try:
387                 test_string = u''.join(map(compat_chr, range(slen)))
388                 cache_res = res(test_string)
389                 cache_spec = [ord(c) for c in cache_res]
390                 try:
391                     os.makedirs(os.path.dirname(cache_fn))
392                 except OSError as ose:
393                     if ose.errno != errno.EEXIST:
394                         raise
395                 write_json_file(cache_spec, cache_fn)
396             except Exception:
397                 tb = traceback.format_exc()
398                 self._downloader.report_warning(
399                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
400
401         return res
402
403     def _print_sig_code(self, func, slen):
404         def gen_sig_code(idxs):
405             def _genslice(start, end, step):
406                 starts = u'' if start == 0 else str(start)
407                 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
408                 steps = u'' if step == 1 else (u':%d' % step)
409                 return u's[%s%s%s]' % (starts, ends, steps)
410
411             step = None
412             start = '(Never used)'  # Quelch pyflakes warnings - start will be
413                                     # set as soon as step is set
414             for i, prev in zip(idxs[1:], idxs[:-1]):
415                 if step is not None:
416                     if i - prev == step:
417                         continue
418                     yield _genslice(start, prev, step)
419                     step = None
420                     continue
421                 if i - prev in [-1, 1]:
422                     step = i - prev
423                     start = prev
424                     continue
425                 else:
426                     yield u's[%d]' % prev
427             if step is None:
428                 yield u's[%d]' % i
429             else:
430                 yield _genslice(start, i, step)
431
432         test_string = u''.join(map(compat_chr, range(slen)))
433         cache_res = func(test_string)
434         cache_spec = [ord(c) for c in cache_res]
435         expr_code = u' + '.join(gen_sig_code(cache_spec))
436         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
437         self.to_screen(u'Extracted signature function:\n' + code)
438
439     def _parse_sig_js(self, jscode):
440         funcname = self._search_regex(
441             r'signature=([a-zA-Z]+)', jscode,
442              u'Initial JS player signature function name')
443
444         jsi = JSInterpreter(jscode)
445         initial_function = jsi.extract_function(funcname)
446         return lambda s: initial_function([s])
447
448     def _parse_sig_swf(self, file_contents):
449         if file_contents[1:3] != b'WS':
450             raise ExtractorError(
451                 u'Not an SWF file; header is %r' % file_contents[:3])
452         if file_contents[:1] == b'C':
453             content = zlib.decompress(file_contents[8:])
454         else:
455             raise NotImplementedError(u'Unsupported compression format %r' %
456                                       file_contents[:1])
457
458         def extract_tags(content):
459             pos = 0
460             while pos < len(content):
461                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
462                 pos += 2
463                 tag_code = header16 >> 6
464                 tag_len = header16 & 0x3f
465                 if tag_len == 0x3f:
466                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
467                     pos += 4
468                 assert pos+tag_len <= len(content)
469                 yield (tag_code, content[pos:pos+tag_len])
470                 pos += tag_len
471
472         code_tag = next(tag
473                         for tag_code, tag in extract_tags(content)
474                         if tag_code == 82)
475         p = code_tag.index(b'\0', 4) + 1
476         code_reader = io.BytesIO(code_tag[p:])
477
478         # Parse ABC (AVM2 ByteCode)
479         def read_int(reader=None):
480             if reader is None:
481                 reader = code_reader
482             res = 0
483             shift = 0
484             for _ in range(5):
485                 buf = reader.read(1)
486                 assert len(buf) == 1
487                 b = struct.unpack('<B', buf)[0]
488                 res = res | ((b & 0x7f) << shift)
489                 if b & 0x80 == 0:
490                     break
491                 shift += 7
492             return res
493
494         def u30(reader=None):
495             res = read_int(reader)
496             assert res & 0xf0000000 == 0
497             return res
498         u32 = read_int
499
500         def s32(reader=None):
501             v = read_int(reader)
502             if v & 0x80000000 != 0:
503                 v = - ((v ^ 0xffffffff) + 1)
504             return v
505
506         def read_string(reader=None):
507             if reader is None:
508                 reader = code_reader
509             slen = u30(reader)
510             resb = reader.read(slen)
511             assert len(resb) == slen
512             return resb.decode('utf-8')
513
514         def read_bytes(count, reader=None):
515             if reader is None:
516                 reader = code_reader
517             resb = reader.read(count)
518             assert len(resb) == count
519             return resb
520
521         def read_byte(reader=None):
522             resb = read_bytes(1, reader=reader)
523             res = struct.unpack('<B', resb)[0]
524             return res
525
526         # minor_version + major_version
527         read_bytes(2 + 2)
528
529         # Constant pool
530         int_count = u30()
531         for _c in range(1, int_count):
532             s32()
533         uint_count = u30()
534         for _c in range(1, uint_count):
535             u32()
536         double_count = u30()
537         read_bytes((double_count-1) * 8)
538         string_count = u30()
539         constant_strings = [u'']
540         for _c in range(1, string_count):
541             s = read_string()
542             constant_strings.append(s)
543         namespace_count = u30()
544         for _c in range(1, namespace_count):
545             read_bytes(1)  # kind
546             u30()  # name
547         ns_set_count = u30()
548         for _c in range(1, ns_set_count):
549             count = u30()
550             for _c2 in range(count):
551                 u30()
552         multiname_count = u30()
553         MULTINAME_SIZES = {
554             0x07: 2,  # QName
555             0x0d: 2,  # QNameA
556             0x0f: 1,  # RTQName
557             0x10: 1,  # RTQNameA
558             0x11: 0,  # RTQNameL
559             0x12: 0,  # RTQNameLA
560             0x09: 2,  # Multiname
561             0x0e: 2,  # MultinameA
562             0x1b: 1,  # MultinameL
563             0x1c: 1,  # MultinameLA
564         }
565         multinames = [u'']
566         for _c in range(1, multiname_count):
567             kind = u30()
568             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
569             if kind == 0x07:
570                 u30()  # namespace_idx
571                 name_idx = u30()
572                 multinames.append(constant_strings[name_idx])
573             else:
574                 multinames.append('[MULTINAME kind: %d]' % kind)
575                 for _c2 in range(MULTINAME_SIZES[kind]):
576                     u30()
577
578         # Methods
579         method_count = u30()
580         MethodInfo = collections.namedtuple(
581             'MethodInfo',
582             ['NEED_ARGUMENTS', 'NEED_REST'])
583         method_infos = []
584         for method_id in range(method_count):
585             param_count = u30()
586             u30()  # return type
587             for _ in range(param_count):
588                 u30()  # param type
589             u30()  # name index (always 0 for youtube)
590             flags = read_byte()
591             if flags & 0x08 != 0:
592                 # Options present
593                 option_count = u30()
594                 for c in range(option_count):
595                     u30()  # val
596                     read_bytes(1)  # kind
597             if flags & 0x80 != 0:
598                 # Param names present
599                 for _ in range(param_count):
600                     u30()  # param name
601             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
602             method_infos.append(mi)
603
604         # Metadata
605         metadata_count = u30()
606         for _c in range(metadata_count):
607             u30()  # name
608             item_count = u30()
609             for _c2 in range(item_count):
610                 u30()  # key
611                 u30()  # value
612
613         def parse_traits_info():
614             trait_name_idx = u30()
615             kind_full = read_byte()
616             kind = kind_full & 0x0f
617             attrs = kind_full >> 4
618             methods = {}
619             if kind in [0x00, 0x06]:  # Slot or Const
620                 u30()  # Slot id
621                 u30()  # type_name_idx
622                 vindex = u30()
623                 if vindex != 0:
624                     read_byte()  # vkind
625             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
626                 u30()  # disp_id
627                 method_idx = u30()
628                 methods[multinames[trait_name_idx]] = method_idx
629             elif kind == 0x04:  # Class
630                 u30()  # slot_id
631                 u30()  # classi
632             elif kind == 0x05:  # Function
633                 u30()  # slot_id
634                 function_idx = u30()
635                 methods[function_idx] = multinames[trait_name_idx]
636             else:
637                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
638
639             if attrs & 0x4 != 0:  # Metadata present
640                 metadata_count = u30()
641                 for _c3 in range(metadata_count):
642                     u30()  # metadata index
643
644             return methods
645
646         # Classes
647         TARGET_CLASSNAME = u'SignatureDecipher'
648         searched_idx = multinames.index(TARGET_CLASSNAME)
649         searched_class_id = None
650         class_count = u30()
651         for class_id in range(class_count):
652             name_idx = u30()
653             if name_idx == searched_idx:
654                 # We found the class we're looking for!
655                 searched_class_id = class_id
656             u30()  # super_name idx
657             flags = read_byte()
658             if flags & 0x08 != 0:  # Protected namespace is present
659                 u30()  # protected_ns_idx
660             intrf_count = u30()
661             for _c2 in range(intrf_count):
662                 u30()
663             u30()  # iinit
664             trait_count = u30()
665             for _c2 in range(trait_count):
666                 parse_traits_info()
667
668         if searched_class_id is None:
669             raise ExtractorError(u'Target class %r not found' %
670                                  TARGET_CLASSNAME)
671
672         method_names = {}
673         method_idxs = {}
674         for class_id in range(class_count):
675             u30()  # cinit
676             trait_count = u30()
677             for _c2 in range(trait_count):
678                 trait_methods = parse_traits_info()
679                 if class_id == searched_class_id:
680                     method_names.update(trait_methods.items())
681                     method_idxs.update(dict(
682                         (idx, name)
683                         for name, idx in trait_methods.items()))
684
685         # Scripts
686         script_count = u30()
687         for _c in range(script_count):
688             u30()  # init
689             trait_count = u30()
690             for _c2 in range(trait_count):
691                 parse_traits_info()
692
693         # Method bodies
694         method_body_count = u30()
695         Method = collections.namedtuple('Method', ['code', 'local_count'])
696         methods = {}
697         for _c in range(method_body_count):
698             method_idx = u30()
699             u30()  # max_stack
700             local_count = u30()
701             u30()  # init_scope_depth
702             u30()  # max_scope_depth
703             code_length = u30()
704             code = read_bytes(code_length)
705             if method_idx in method_idxs:
706                 m = Method(code, local_count)
707                 methods[method_idxs[method_idx]] = m
708             exception_count = u30()
709             for _c2 in range(exception_count):
710                 u30()  # from
711                 u30()  # to
712                 u30()  # target
713                 u30()  # exc_type
714                 u30()  # var_name
715             trait_count = u30()
716             for _c2 in range(trait_count):
717                 parse_traits_info()
718
719         assert p + code_reader.tell() == len(code_tag)
720         assert len(methods) == len(method_idxs)
721
722         method_pyfunctions = {}
723
724         def extract_function(func_name):
725             if func_name in method_pyfunctions:
726                 return method_pyfunctions[func_name]
727             if func_name not in methods:
728                 raise ExtractorError(u'Cannot find function %r' % func_name)
729             m = methods[func_name]
730
731             def resfunc(args):
732                 registers = ['(this)'] + list(args) + [None] * m.local_count
733                 stack = []
734                 coder = io.BytesIO(m.code)
735                 while True:
736                     opcode = struct.unpack('!B', coder.read(1))[0]
737                     if opcode == 36:  # pushbyte
738                         v = struct.unpack('!B', coder.read(1))[0]
739                         stack.append(v)
740                     elif opcode == 44:  # pushstring
741                         idx = u30(coder)
742                         stack.append(constant_strings[idx])
743                     elif opcode == 48:  # pushscope
744                         # We don't implement the scope register, so we'll just
745                         # ignore the popped value
746                         stack.pop()
747                     elif opcode == 70:  # callproperty
748                         index = u30(coder)
749                         mname = multinames[index]
750                         arg_count = u30(coder)
751                         args = list(reversed(
752                             [stack.pop() for _ in range(arg_count)]))
753                         obj = stack.pop()
754                         if mname == u'split':
755                             assert len(args) == 1
756                             assert isinstance(args[0], compat_str)
757                             assert isinstance(obj, compat_str)
758                             if args[0] == u'':
759                                 res = list(obj)
760                             else:
761                                 res = obj.split(args[0])
762                             stack.append(res)
763                         elif mname == u'slice':
764                             assert len(args) == 1
765                             assert isinstance(args[0], int)
766                             assert isinstance(obj, list)
767                             res = obj[args[0]:]
768                             stack.append(res)
769                         elif mname == u'join':
770                             assert len(args) == 1
771                             assert isinstance(args[0], compat_str)
772                             assert isinstance(obj, list)
773                             res = args[0].join(obj)
774                             stack.append(res)
775                         elif mname in method_pyfunctions:
776                             stack.append(method_pyfunctions[mname](args))
777                         else:
778                             raise NotImplementedError(
779                                 u'Unsupported property %r on %r'
780                                 % (mname, obj))
781                     elif opcode == 72:  # returnvalue
782                         res = stack.pop()
783                         return res
784                     elif opcode == 79:  # callpropvoid
785                         index = u30(coder)
786                         mname = multinames[index]
787                         arg_count = u30(coder)
788                         args = list(reversed(
789                             [stack.pop() for _ in range(arg_count)]))
790                         obj = stack.pop()
791                         if mname == u'reverse':
792                             assert isinstance(obj, list)
793                             obj.reverse()
794                         else:
795                             raise NotImplementedError(
796                                 u'Unsupported (void) property %r on %r'
797                                 % (mname, obj))
798                     elif opcode == 93:  # findpropstrict
799                         index = u30(coder)
800                         mname = multinames[index]
801                         res = extract_function(mname)
802                         stack.append(res)
803                     elif opcode == 97:  # setproperty
804                         index = u30(coder)
805                         value = stack.pop()
806                         idx = stack.pop()
807                         obj = stack.pop()
808                         assert isinstance(obj, list)
809                         assert isinstance(idx, int)
810                         obj[idx] = value
811                     elif opcode == 98:  # getlocal
812                         index = u30(coder)
813                         stack.append(registers[index])
814                     elif opcode == 99:  # setlocal
815                         index = u30(coder)
816                         value = stack.pop()
817                         registers[index] = value
818                     elif opcode == 102:  # getproperty
819                         index = u30(coder)
820                         pname = multinames[index]
821                         if pname == u'length':
822                             obj = stack.pop()
823                             assert isinstance(obj, list)
824                             stack.append(len(obj))
825                         else:  # Assume attribute access
826                             idx = stack.pop()
827                             assert isinstance(idx, int)
828                             obj = stack.pop()
829                             assert isinstance(obj, list)
830                             stack.append(obj[idx])
831                     elif opcode == 128:  # coerce
832                         u30(coder)
833                     elif opcode == 133:  # coerce_s
834                         assert isinstance(stack[-1], (type(None), compat_str))
835                     elif opcode == 164:  # modulo
836                         value2 = stack.pop()
837                         value1 = stack.pop()
838                         res = value1 % value2
839                         stack.append(res)
840                     elif opcode == 208:  # getlocal_0
841                         stack.append(registers[0])
842                     elif opcode == 209:  # getlocal_1
843                         stack.append(registers[1])
844                     elif opcode == 210:  # getlocal_2
845                         stack.append(registers[2])
846                     elif opcode == 211:  # getlocal_3
847                         stack.append(registers[3])
848                     elif opcode == 214:  # setlocal_2
849                         registers[2] = stack.pop()
850                     elif opcode == 215:  # setlocal_3
851                         registers[3] = stack.pop()
852                     else:
853                         raise NotImplementedError(
854                             u'Unsupported opcode %d' % opcode)
855
856             method_pyfunctions[func_name] = resfunc
857             return resfunc
858
859         initial_function = extract_function(u'decipher')
860         return lambda s: initial_function([s])
861
862     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
863         """Turn the encrypted s field into a working signature"""
864
865         if player_url is not None:
866             if player_url.startswith(u'//'):
867                 player_url = u'https:' + player_url
868             try:
869                 player_id = (player_url, len(s))
870                 if player_id not in self._player_cache:
871                     func = self._extract_signature_function(
872                         video_id, player_url, len(s)
873                     )
874                     self._player_cache[player_id] = func
875                 func = self._player_cache[player_id]
876                 if self._downloader.params.get('youtube_print_sig_code'):
877                     self._print_sig_code(func, len(s))
878                 return func(s)
879             except Exception:
880                 tb = traceback.format_exc()
881                 self._downloader.report_warning(
882                     u'Automatic signature extraction failed: ' + tb)
883
884             self._downloader.report_warning(
885                 u'Warning: Falling back to static signature algorithm')
886
887         return self._static_decrypt_signature(
888             s, video_id, player_url, age_gate)
889
890     def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
891         if age_gate:
892             # The videos with age protection use another player, so the
893             # algorithms can be different.
894             if len(s) == 86:
895                 return s[2:63] + s[82] + s[64:82] + s[63]
896
897         if len(s) == 93:
898             return s[86:29:-1] + s[88] + s[28:5:-1]
899         elif len(s) == 92:
900             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
901         elif len(s) == 91:
902             return s[84:27:-1] + s[86] + s[26:5:-1]
903         elif len(s) == 90:
904             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
905         elif len(s) == 89:
906             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
907         elif len(s) == 88:
908             return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
909         elif len(s) == 87:
910             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
911         elif len(s) == 86:
912             return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
913         elif len(s) == 85:
914             return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
915         elif len(s) == 84:
916             return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
917         elif len(s) == 83:
918             return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
919         elif len(s) == 82:
920             return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
921         elif len(s) == 81:
922             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
923         elif len(s) == 80:
924             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
925         elif len(s) == 79:
926             return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
927
928         else:
929             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
930
931     def _get_available_subtitles(self, video_id, webpage):
932         try:
933             sub_list = self._download_webpage(
934                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
935                 video_id, note=False)
936         except ExtractorError as err:
937             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
938             return {}
939         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
940
941         sub_lang_list = {}
942         for l in lang_list:
943             lang = l[1]
944             params = compat_urllib_parse.urlencode({
945                 'lang': lang,
946                 'v': video_id,
947                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
948                 'name': unescapeHTML(l[0]).encode('utf-8'),
949             })
950             url = u'https://www.youtube.com/api/timedtext?' + params
951             sub_lang_list[lang] = url
952         if not sub_lang_list:
953             self._downloader.report_warning(u'video doesn\'t have subtitles')
954             return {}
955         return sub_lang_list
956
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL and timestamp live in the inline ytplayer
        # config JSON embedded in the watch page.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Automatic captions only exist when the first track is marked
            # as ASR (automatic speech recognition).
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per available target language,
            # keyed by the target language code.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles; KeyError covers
        # missing 'args'/'ttsurl'/'timestamp' entries in the player config.
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1003
1004     @classmethod
1005     def extract_id(cls, url):
1006         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1007         if mobj is None:
1008             raise ExtractorError(u'Invalid URL: %s' % url)
1009         video_id = mobj.group(2)
1010         return video_id
1011
1012     def _extract_from_m3u8(self, manifest_url, video_id):
1013         url_map = {}
1014         def _get_urls(_manifest):
1015             lines = _manifest.split('\n')
1016             urls = filter(lambda l: l and not l.startswith('#'),
1017                             lines)
1018             return urls
1019         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1020         formats_urls = _get_urls(manifest)
1021         for format_url in formats_urls:
1022             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1023             url_map[itag] = format_url
1024         return url_map
1025
1026     def _extract_annotations(self, video_id):
1027         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1028         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1029
1030     def _real_extract(self, url):
1031         proto = (
1032             u'http' if self._downloader.params.get('prefer_insecure', False)
1033             else u'https')
1034
1035         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1036         mobj = re.search(self._NEXT_URL_RE, url)
1037         if mobj:
1038             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1039         video_id = self.extract_id(url)
1040
1041         # Get video webpage
1042         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1043         video_webpage = self._download_webpage(url, video_id)
1044
1045         # Attempt to extract SWF player URL
1046         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1047         if mobj is not None:
1048             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1049         else:
1050             player_url = None
1051
1052         # Get video info
1053         self.report_video_info_webpage_download(video_id)
1054         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1055             self.report_age_confirmation()
1056             age_gate = True
1057             # We simulate the access to the video from www.youtube.com/v/{video_id}
1058             # this can be viewed without login into Youtube
1059             data = compat_urllib_parse.urlencode({'video_id': video_id,
1060                                                   'el': 'player_embedded',
1061                                                   'gl': 'US',
1062                                                   'hl': 'en',
1063                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1064                                                   'asv': 3,
1065                                                   'sts':'1588',
1066                                                   })
1067             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1068             video_info_webpage = self._download_webpage(video_info_url, video_id,
1069                                     note=False,
1070                                     errnote='unable to download video info webpage')
1071             video_info = compat_parse_qs(video_info_webpage)
1072         else:
1073             age_gate = False
1074             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1075                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1076                         % (video_id, el_type))
1077                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1078                                         note=False,
1079                                         errnote='unable to download video info webpage')
1080                 video_info = compat_parse_qs(video_info_webpage)
1081                 if 'token' in video_info:
1082                     break
1083         if 'token' not in video_info:
1084             if 'reason' in video_info:
1085                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1086             else:
1087                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1088
1089         if 'view_count' in video_info:
1090             view_count = int(video_info['view_count'][0])
1091         else:
1092             view_count = None
1093
1094         # Check for "rental" videos
1095         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1096             raise ExtractorError(u'"rental" videos not supported')
1097
1098         # Start extracting information
1099         self.report_information_extraction(video_id)
1100
1101         # uploader
1102         if 'author' not in video_info:
1103             raise ExtractorError(u'Unable to extract uploader name')
1104         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1105
1106         # uploader_id
1107         video_uploader_id = None
1108         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1109         if mobj is not None:
1110             video_uploader_id = mobj.group(1)
1111         else:
1112             self._downloader.report_warning(u'unable to extract uploader nickname')
1113
1114         # title
1115         if 'title' in video_info:
1116             video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1117         else:
1118             self._downloader.report_warning(u'Unable to extract video title')
1119             video_title = u'_'
1120
1121         # thumbnail image
1122         # We try first to get a high quality image:
1123         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1124                             video_webpage, re.DOTALL)
1125         if m_thumb is not None:
1126             video_thumbnail = m_thumb.group(1)
1127         elif 'thumbnail_url' not in video_info:
1128             self._downloader.report_warning(u'unable to extract video thumbnail')
1129             video_thumbnail = None
1130         else:   # don't panic if we can't find it
1131             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1132
1133         # upload date
1134         upload_date = None
1135         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1136         if mobj is not None:
1137             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1138             upload_date = unified_strdate(upload_date)
1139
1140         # description
1141         video_description = get_element_by_id("eow-description", video_webpage)
1142         if video_description:
1143             video_description = re.sub(r'''(?x)
1144                 <a\s+
1145                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1146                     title="([^"]+)"\s+
1147                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1148                     class="yt-uix-redirect-link"\s*>
1149                 [^<]+
1150                 </a>
1151             ''', r'\1', video_description)
1152             video_description = clean_html(video_description)
1153         else:
1154             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1155             if fd_mobj:
1156                 video_description = unescapeHTML(fd_mobj.group(1))
1157             else:
1158                 video_description = u''
1159
1160         def _extract_count(klass):
1161             count = self._search_regex(
1162                 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1163                 video_webpage, klass, default=None)
1164             if count is not None:
1165                 return int(count.replace(',', ''))
1166             return None
1167         like_count = _extract_count(u'likes-count')
1168         dislike_count = _extract_count(u'dislikes-count')
1169
1170         # subtitles
1171         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1172
1173         if self._downloader.params.get('listsubtitles', False):
1174             self._list_available_subtitles(video_id, video_webpage)
1175             return
1176
1177         if 'length_seconds' not in video_info:
1178             self._downloader.report_warning(u'unable to extract video duration')
1179             video_duration = None
1180         else:
1181             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1182
1183         # annotations
1184         video_annotations = None
1185         if self._downloader.params.get('writeannotations', False):
1186                 video_annotations = self._extract_annotations(video_id)
1187
1188         # Decide which formats to download
1189         try:
1190             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1191             if not mobj:
1192                 raise ValueError('Could not find vevo ID')
1193             json_code = uppercase_escape(mobj.group(1))
1194             ytplayer_config = json.loads(json_code)
1195             args = ytplayer_config['args']
1196             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1197             # this signatures are encrypted
1198             if 'url_encoded_fmt_stream_map' not in args:
1199                 raise ValueError(u'No stream_map present')  # caught below
1200             re_signature = re.compile(r'[&,]s=')
1201             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1202             if m_s is not None:
1203                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1204                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1205             m_s = re_signature.search(args.get('adaptive_fmts', u''))
1206             if m_s is not None:
1207                 if 'adaptive_fmts' in video_info:
1208                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1209                 else:
1210                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1211         except ValueError:
1212             pass
1213
1214         def _map_to_format_list(urlmap):
1215             formats = []
1216             for itag, video_real_url in urlmap.items():
1217                 dct = {
1218                     'format_id': itag,
1219                     'url': video_real_url,
1220                     'player_url': player_url,
1221                 }
1222                 if itag in self._formats:
1223                     dct.update(self._formats[itag])
1224                 formats.append(dct)
1225             return formats
1226
1227         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1228             self.report_rtmp_download()
1229             formats = [{
1230                 'format_id': '_rtmp',
1231                 'protocol': 'rtmp',
1232                 'url': video_info['conn'][0],
1233                 'player_url': player_url,
1234             }]
1235         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1236             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1237             if 'rtmpe%3Dyes' in encoded_url_map:
1238                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1239             url_map = {}
1240             for url_data_str in encoded_url_map.split(','):
1241                 url_data = compat_parse_qs(url_data_str)
1242                 if 'itag' in url_data and 'url' in url_data:
1243                     url = url_data['url'][0]
1244                     if 'sig' in url_data:
1245                         url += '&signature=' + url_data['sig'][0]
1246                     elif 's' in url_data:
1247                         encrypted_sig = url_data['s'][0]
1248                         if self._downloader.params.get('verbose'):
1249                             if age_gate:
1250                                 if player_url is None:
1251                                     player_version = 'unknown'
1252                                 else:
1253                                     player_version = self._search_regex(
1254                                         r'-(.+)\.swf$', player_url,
1255                                         u'flash player', fatal=False)
1256                                 player_desc = 'flash player %s' % player_version
1257                             else:
1258                                 player_version = self._search_regex(
1259                                     r'html5player-(.+?)\.js', video_webpage,
1260                                     'html5 player', fatal=False)
1261                                 player_desc = u'html5 player %s' % player_version
1262
1263                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1264                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1265                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1266
1267                         if not age_gate:
1268                             jsplayer_url_json = self._search_regex(
1269                                 r'"assets":.+?"js":\s*("[^"]+")',
1270                                 video_webpage, u'JS player URL')
1271                             player_url = json.loads(jsplayer_url_json)
1272
1273                         signature = self._decrypt_signature(
1274                             encrypted_sig, video_id, player_url, age_gate)
1275                         url += '&signature=' + signature
1276                     if 'ratebypass' not in url:
1277                         url += '&ratebypass=yes'
1278                     url_map[url_data['itag'][0]] = url
1279             formats = _map_to_format_list(url_map)
1280         elif video_info.get('hlsvp'):
1281             manifest_url = video_info['hlsvp'][0]
1282             url_map = self._extract_from_m3u8(manifest_url, video_id)
1283             formats = _map_to_format_list(url_map)
1284         else:
1285             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1286
1287         # Look for the DASH manifest
1288         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
1289             try:
1290                 # The DASH manifest used needs to be the one from the original video_webpage.
1291                 # The one found in get_video_info seems to be using different signatures.
1292                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
1293                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
1294                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
1295                 if age_gate:
1296                     dash_manifest_url = video_info.get('dashmpd')[0]
1297                 else:
1298                     dash_manifest_url = ytplayer_config['args']['dashmpd']
1299                 def decrypt_sig(mobj):
1300                     s = mobj.group(1)
1301                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1302                     return '/signature/%s' % dec_s
1303                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
1304                 dash_doc = self._download_xml(
1305                     dash_manifest_url, video_id,
1306                     note=u'Downloading DASH manifest',
1307                     errnote=u'Could not download DASH manifest')
1308                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1309                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1310                     if url_el is None:
1311                         continue
1312                     format_id = r.attrib['id']
1313                     video_url = url_el.text
1314                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1315                     f = {
1316                         'format_id': format_id,
1317                         'url': video_url,
1318                         'width': int_or_none(r.attrib.get('width')),
1319                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1320                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1321                         'filesize': filesize,
1322                     }
1323                     try:
1324                         existing_format = next(
1325                             fo for fo in formats
1326                             if fo['format_id'] == format_id)
1327                     except StopIteration:
1328                         f.update(self._formats.get(format_id, {}))
1329                         formats.append(f)
1330                     else:
1331                         existing_format.update(f)
1332
1333             except (ExtractorError, KeyError) as e:
1334                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1335
1336         self._sort_formats(formats)
1337
1338         return {
1339             'id':           video_id,
1340             'uploader':     video_uploader,
1341             'uploader_id':  video_uploader_id,
1342             'upload_date':  upload_date,
1343             'title':        video_title,
1344             'thumbnail':    video_thumbnail,
1345             'description':  video_description,
1346             'subtitles':    video_subtitles,
1347             'duration':     video_duration,
1348             'age_limit':    18 if age_gate else 0,
1349             'annotations':  video_annotations,
1350             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1351             'view_count':   view_count,
1352             'like_count': like_count,
1353             'dislike_count': dislike_count,
1354             'formats':      formats,
1355         }
1356
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist, including auto-generated mixes."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap plain video ids into url results handled by YoutubeIE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Some of the videos may have been deleted, their username field is empty
        ids = [video_id for (username, video_id) in matches if username]
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # playlist_id already carries its own prefix (PL/EC/UU/FL/RD),
                # so the message must not hard-code 'PL' in front of it.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the href of the next ajax page;
            # its absence marks the last page.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1459
1460
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a "yttoplist:{channel}:{list title}" pseudo-URL to its playlist."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Locate the playlist link on the channel page by its urlencoded title.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1491
1492
class YoutubeChannelIE(InfoExtractor):
    """Extract every video of a YouTube channel as a playlist."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in a channel page, in first-seen order.

        Membership is tracked in a set so large pages stay O(n); the previous
        `id not in list` check was quadratic.
        """
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The load-more widget disappears from the last ajax page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1547
1548
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a user via the gdata API, paged lazily."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't claim a url that any other youtube extractor handles:
        # this regex is too permissive and would match their urls too.
        competitors = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in competitors):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator: yields one url-result dict per upload on the page;
            # yields nothing once the feed has no more entries.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                return

            # Extract video identifiers
            for entry in feed['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1609
1610
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the gdata API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        # The effective limit shrinks to the API's totalItems once known.
        limit = n

        for pagenum in itertools.count(0):
            if (50 * pagenum) >= limit:
                break
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])
            limit = min(n, api_response['totalItems'])

        # Never return more than the n requested results.
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
1649
1650
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same gdata search as YoutubeSearchIE; only the API URL differs, adding
    # orderby=published so the newest uploads come first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1656
1657
class YoutubeSearchURLIE(InfoExtractor):
    """Scrape one results page of a youtube.com/results search URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        # Each result lives in its own lockup-title header.
        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1692
1693
class YoutubeShowIE(InfoExtractor):
    """Return one playlist url result per season of a YouTube show."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons]
1707
1708
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are tied to an account, so login is mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with a single remaining '%s' placeholder for the
        # paging token (note the escaped '%%s').
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        # Walk the paged feed: each JSON response carries the rendered HTML
        # with the video links plus the paging token of the next page.
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null 'paging' value marks the last page of the feed.
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1751
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed: uses the default action_load_system_feed of the base class.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1757
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed: uses the default action_load_system_feed of the base class.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1763
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Personal feed: makes the base class use action_load_personal_feed.
    _PERSONAL_FEED = True
1770
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's watch-history feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string like every sibling extractor: the previous u'' literal
    # contained the unrecognized escape '\.', which Python only passes
    # through by (deprecated) leniency.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Personal feed: makes the base class use action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1777
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page links the favourites playlist; hand it to YoutubePlaylistIE.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1788
1789
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose v= parameter was eaten by the shell and
    explain the quoting problem instead of failing cryptically."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # Unconditionally raise: there is no video id left to extract here.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)