[youtube] Remove static signatures
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18     compat_chr,
19     compat_parse_qs,
20     compat_urllib_parse,
21     compat_urllib_request,
22     compat_urlparse,
23     compat_str,
24
25     clean_html,
26     get_cachedir,
27     get_element_by_id,
28     get_element_by_attribute,
29     ExtractorError,
30     int_or_none,
31     PagedList,
32     unescapeHTML,
33     unified_strdate,
34     orderedSet,
35     write_json_file,
36     uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the site language to English; return True on success.

        The download is non-fatal, so a failure only yields False.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in via the Google accounts form with the configured credentials.

        Returns True on success and False when no credentials are available
        or the login is rejected; returns None when the login page itself
        cannot be downloaded.  Raises ExtractorError when credentials are
        missing but _LOGIN_REQUIRED is True.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # The GALX token embedded in the login page must be echoed back in
        # the form submission.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the response still contains the login form, the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification confirmation form; always returns True.

        The download is fatal, so an unreachable form raises from
        _download_webpage instead of returning.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set the language, log in (when credentials are configured) and
        # confirm age, stopping at the first step that fails.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (re.X) pattern matching every URL form that carries an
    # 11-character video ID: full watch pages, embeds, mirrors/proxies,
    # youtu.be short links, or a naked ID.  Group 1 spans the URL prefix;
    # group 2 is the video ID itself.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the target of a next_url redirect query parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> intrinsic format properties.  Only properties fixed for
    # a given itag are listed; everything else is filled in from the video
    # metadata at extraction time.  Negative 'preference' values rank the
    # special-purpose formats (3D, HLS, DASH) below the default progressive
    # downloads.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        # NOTE: 151 really is a 72p stream, not a typo for 720.
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
236
    IE_NAME = u'youtube'
    # Integration test fixtures consumed by the test_download harness.
    # Description values prefixed with "md5:" are checksums of the full text.
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file":  u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
322
323     @classmethod
324     def suitable(cls, url):
325         """Receives a URL and returns True if suitable for this IE."""
326         if YoutubePlaylistIE.suitable(url): return False
327         return re.match(cls._VALID_URL, url) is not None
328
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-run cache of signature-deciphering functions, so each player
        # version only needs to be downloaded and parsed once.
        self._player_cache = {}
332
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
336
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
340
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
344
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
348
349     def _extract_signature_function(self, video_id, player_url, slen):
350         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
351                         player_url)
352         player_type = id_m.group('ext')
353         player_id = id_m.group('id')
354
355         # Read from filesystem cache
356         func_id = '%s_%s_%d' % (player_type, player_id, slen)
357         assert os.path.basename(func_id) == func_id
358         cache_dir = get_cachedir(self._downloader.params)
359
360         cache_enabled = cache_dir is not None
361         if cache_enabled:
362             cache_fn = os.path.join(os.path.expanduser(cache_dir),
363                                     u'youtube-sigfuncs',
364                                     func_id + '.json')
365             try:
366                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
367                     cache_spec = json.load(cachef)
368                 return lambda s: u''.join(s[i] for i in cache_spec)
369             except IOError:
370                 pass  # No cache available
371
372         if player_type == 'js':
373             code = self._download_webpage(
374                 player_url, video_id,
375                 note=u'Downloading %s player %s' % (player_type, player_id),
376                 errnote=u'Download of %s failed' % player_url)
377             res = self._parse_sig_js(code)
378         elif player_type == 'swf':
379             urlh = self._request_webpage(
380                 player_url, video_id,
381                 note=u'Downloading %s player %s' % (player_type, player_id),
382                 errnote=u'Download of %s failed' % player_url)
383             code = urlh.read()
384             res = self._parse_sig_swf(code)
385         else:
386             assert False, 'Invalid player type %r' % player_type
387
388         if cache_enabled:
389             try:
390                 test_string = u''.join(map(compat_chr, range(slen)))
391                 cache_res = res(test_string)
392                 cache_spec = [ord(c) for c in cache_res]
393                 try:
394                     os.makedirs(os.path.dirname(cache_fn))
395                 except OSError as ose:
396                     if ose.errno != errno.EEXIST:
397                         raise
398                 write_json_file(cache_spec, cache_fn)
399             except Exception:
400                 tb = traceback.format_exc()
401                 self._downloader.report_warning(
402                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
403
404         return res
405
406     def _print_sig_code(self, func, slen):
407         def gen_sig_code(idxs):
408             def _genslice(start, end, step):
409                 starts = u'' if start == 0 else str(start)
410                 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
411                 steps = u'' if step == 1 else (u':%d' % step)
412                 return u's[%s%s%s]' % (starts, ends, steps)
413
414             step = None
415             start = '(Never used)'  # Quelch pyflakes warnings - start will be
416                                     # set as soon as step is set
417             for i, prev in zip(idxs[1:], idxs[:-1]):
418                 if step is not None:
419                     if i - prev == step:
420                         continue
421                     yield _genslice(start, prev, step)
422                     step = None
423                     continue
424                 if i - prev in [-1, 1]:
425                     step = i - prev
426                     start = prev
427                     continue
428                 else:
429                     yield u's[%d]' % prev
430             if step is None:
431                 yield u's[%d]' % i
432             else:
433                 yield _genslice(start, i, step)
434
435         test_string = u''.join(map(compat_chr, range(slen)))
436         cache_res = func(test_string)
437         cache_spec = [ord(c) for c in cache_res]
438         expr_code = u' + '.join(gen_sig_code(cache_spec))
439         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
440         self.to_screen(u'Extracted signature function:\n' + code)
441
442     def _parse_sig_js(self, jscode):
443         funcname = self._search_regex(
444             r'signature=([$a-zA-Z]+)', jscode,
445              u'Initial JS player signature function name')
446
447         jsi = JSInterpreter(jscode)
448         initial_function = jsi.extract_function(funcname)
449         return lambda s: initial_function([s])
450
451     def _parse_sig_swf(self, file_contents):
452         if file_contents[1:3] != b'WS':
453             raise ExtractorError(
454                 u'Not an SWF file; header is %r' % file_contents[:3])
455         if file_contents[:1] == b'C':
456             content = zlib.decompress(file_contents[8:])
457         else:
458             raise NotImplementedError(u'Unsupported compression format %r' %
459                                       file_contents[:1])
460
461         def extract_tags(content):
462             pos = 0
463             while pos < len(content):
464                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
465                 pos += 2
466                 tag_code = header16 >> 6
467                 tag_len = header16 & 0x3f
468                 if tag_len == 0x3f:
469                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
470                     pos += 4
471                 assert pos+tag_len <= len(content)
472                 yield (tag_code, content[pos:pos+tag_len])
473                 pos += tag_len
474
475         code_tag = next(tag
476                         for tag_code, tag in extract_tags(content)
477                         if tag_code == 82)
478         p = code_tag.index(b'\0', 4) + 1
479         code_reader = io.BytesIO(code_tag[p:])
480
481         # Parse ABC (AVM2 ByteCode)
482         def read_int(reader=None):
483             if reader is None:
484                 reader = code_reader
485             res = 0
486             shift = 0
487             for _ in range(5):
488                 buf = reader.read(1)
489                 assert len(buf) == 1
490                 b = struct.unpack('<B', buf)[0]
491                 res = res | ((b & 0x7f) << shift)
492                 if b & 0x80 == 0:
493                     break
494                 shift += 7
495             return res
496
497         def u30(reader=None):
498             res = read_int(reader)
499             assert res & 0xf0000000 == 0
500             return res
501         u32 = read_int
502
503         def s32(reader=None):
504             v = read_int(reader)
505             if v & 0x80000000 != 0:
506                 v = - ((v ^ 0xffffffff) + 1)
507             return v
508
509         def read_string(reader=None):
510             if reader is None:
511                 reader = code_reader
512             slen = u30(reader)
513             resb = reader.read(slen)
514             assert len(resb) == slen
515             return resb.decode('utf-8')
516
517         def read_bytes(count, reader=None):
518             if reader is None:
519                 reader = code_reader
520             resb = reader.read(count)
521             assert len(resb) == count
522             return resb
523
524         def read_byte(reader=None):
525             resb = read_bytes(1, reader=reader)
526             res = struct.unpack('<B', resb)[0]
527             return res
528
529         # minor_version + major_version
530         read_bytes(2 + 2)
531
532         # Constant pool
533         int_count = u30()
534         for _c in range(1, int_count):
535             s32()
536         uint_count = u30()
537         for _c in range(1, uint_count):
538             u32()
539         double_count = u30()
540         read_bytes((double_count-1) * 8)
541         string_count = u30()
542         constant_strings = [u'']
543         for _c in range(1, string_count):
544             s = read_string()
545             constant_strings.append(s)
546         namespace_count = u30()
547         for _c in range(1, namespace_count):
548             read_bytes(1)  # kind
549             u30()  # name
550         ns_set_count = u30()
551         for _c in range(1, ns_set_count):
552             count = u30()
553             for _c2 in range(count):
554                 u30()
555         multiname_count = u30()
556         MULTINAME_SIZES = {
557             0x07: 2,  # QName
558             0x0d: 2,  # QNameA
559             0x0f: 1,  # RTQName
560             0x10: 1,  # RTQNameA
561             0x11: 0,  # RTQNameL
562             0x12: 0,  # RTQNameLA
563             0x09: 2,  # Multiname
564             0x0e: 2,  # MultinameA
565             0x1b: 1,  # MultinameL
566             0x1c: 1,  # MultinameLA
567         }
568         multinames = [u'']
569         for _c in range(1, multiname_count):
570             kind = u30()
571             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
572             if kind == 0x07:
573                 u30()  # namespace_idx
574                 name_idx = u30()
575                 multinames.append(constant_strings[name_idx])
576             else:
577                 multinames.append('[MULTINAME kind: %d]' % kind)
578                 for _c2 in range(MULTINAME_SIZES[kind]):
579                     u30()
580
581         # Methods
582         method_count = u30()
583         MethodInfo = collections.namedtuple(
584             'MethodInfo',
585             ['NEED_ARGUMENTS', 'NEED_REST'])
586         method_infos = []
587         for method_id in range(method_count):
588             param_count = u30()
589             u30()  # return type
590             for _ in range(param_count):
591                 u30()  # param type
592             u30()  # name index (always 0 for youtube)
593             flags = read_byte()
594             if flags & 0x08 != 0:
595                 # Options present
596                 option_count = u30()
597                 for c in range(option_count):
598                     u30()  # val
599                     read_bytes(1)  # kind
600             if flags & 0x80 != 0:
601                 # Param names present
602                 for _ in range(param_count):
603                     u30()  # param name
604             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
605             method_infos.append(mi)
606
607         # Metadata
608         metadata_count = u30()
609         for _c in range(metadata_count):
610             u30()  # name
611             item_count = u30()
612             for _c2 in range(item_count):
613                 u30()  # key
614                 u30()  # value
615
616         def parse_traits_info():
617             trait_name_idx = u30()
618             kind_full = read_byte()
619             kind = kind_full & 0x0f
620             attrs = kind_full >> 4
621             methods = {}
622             if kind in [0x00, 0x06]:  # Slot or Const
623                 u30()  # Slot id
624                 u30()  # type_name_idx
625                 vindex = u30()
626                 if vindex != 0:
627                     read_byte()  # vkind
628             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
629                 u30()  # disp_id
630                 method_idx = u30()
631                 methods[multinames[trait_name_idx]] = method_idx
632             elif kind == 0x04:  # Class
633                 u30()  # slot_id
634                 u30()  # classi
635             elif kind == 0x05:  # Function
636                 u30()  # slot_id
637                 function_idx = u30()
638                 methods[function_idx] = multinames[trait_name_idx]
639             else:
640                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
641
642             if attrs & 0x4 != 0:  # Metadata present
643                 metadata_count = u30()
644                 for _c3 in range(metadata_count):
645                     u30()  # metadata index
646
647             return methods
648
649         # Classes
650         TARGET_CLASSNAME = u'SignatureDecipher'
651         searched_idx = multinames.index(TARGET_CLASSNAME)
652         searched_class_id = None
653         class_count = u30()
654         for class_id in range(class_count):
655             name_idx = u30()
656             if name_idx == searched_idx:
657                 # We found the class we're looking for!
658                 searched_class_id = class_id
659             u30()  # super_name idx
660             flags = read_byte()
661             if flags & 0x08 != 0:  # Protected namespace is present
662                 u30()  # protected_ns_idx
663             intrf_count = u30()
664             for _c2 in range(intrf_count):
665                 u30()
666             u30()  # iinit
667             trait_count = u30()
668             for _c2 in range(trait_count):
669                 parse_traits_info()
670
671         if searched_class_id is None:
672             raise ExtractorError(u'Target class %r not found' %
673                                  TARGET_CLASSNAME)
674
675         method_names = {}
676         method_idxs = {}
677         for class_id in range(class_count):
678             u30()  # cinit
679             trait_count = u30()
680             for _c2 in range(trait_count):
681                 trait_methods = parse_traits_info()
682                 if class_id == searched_class_id:
683                     method_names.update(trait_methods.items())
684                     method_idxs.update(dict(
685                         (idx, name)
686                         for name, idx in trait_methods.items()))
687
688         # Scripts
689         script_count = u30()
690         for _c in range(script_count):
691             u30()  # init
692             trait_count = u30()
693             for _c2 in range(trait_count):
694                 parse_traits_info()
695
696         # Method bodies
697         method_body_count = u30()
698         Method = collections.namedtuple('Method', ['code', 'local_count'])
699         methods = {}
700         for _c in range(method_body_count):
701             method_idx = u30()
702             u30()  # max_stack
703             local_count = u30()
704             u30()  # init_scope_depth
705             u30()  # max_scope_depth
706             code_length = u30()
707             code = read_bytes(code_length)
708             if method_idx in method_idxs:
709                 m = Method(code, local_count)
710                 methods[method_idxs[method_idx]] = m
711             exception_count = u30()
712             for _c2 in range(exception_count):
713                 u30()  # from
714                 u30()  # to
715                 u30()  # target
716                 u30()  # exc_type
717                 u30()  # var_name
718             trait_count = u30()
719             for _c2 in range(trait_count):
720                 parse_traits_info()
721
722         assert p + code_reader.tell() == len(code_tag)
723         assert len(methods) == len(method_idxs)
724
725         method_pyfunctions = {}
726
727         def extract_function(func_name):
728             if func_name in method_pyfunctions:
729                 return method_pyfunctions[func_name]
730             if func_name not in methods:
731                 raise ExtractorError(u'Cannot find function %r' % func_name)
732             m = methods[func_name]
733
734             def resfunc(args):
735                 registers = ['(this)'] + list(args) + [None] * m.local_count
736                 stack = []
737                 coder = io.BytesIO(m.code)
738                 while True:
739                     opcode = struct.unpack('!B', coder.read(1))[0]
740                     if opcode == 36:  # pushbyte
741                         v = struct.unpack('!B', coder.read(1))[0]
742                         stack.append(v)
743                     elif opcode == 44:  # pushstring
744                         idx = u30(coder)
745                         stack.append(constant_strings[idx])
746                     elif opcode == 48:  # pushscope
747                         # We don't implement the scope register, so we'll just
748                         # ignore the popped value
749                         stack.pop()
750                     elif opcode == 70:  # callproperty
751                         index = u30(coder)
752                         mname = multinames[index]
753                         arg_count = u30(coder)
754                         args = list(reversed(
755                             [stack.pop() for _ in range(arg_count)]))
756                         obj = stack.pop()
757                         if mname == u'split':
758                             assert len(args) == 1
759                             assert isinstance(args[0], compat_str)
760                             assert isinstance(obj, compat_str)
761                             if args[0] == u'':
762                                 res = list(obj)
763                             else:
764                                 res = obj.split(args[0])
765                             stack.append(res)
766                         elif mname == u'slice':
767                             assert len(args) == 1
768                             assert isinstance(args[0], int)
769                             assert isinstance(obj, list)
770                             res = obj[args[0]:]
771                             stack.append(res)
772                         elif mname == u'join':
773                             assert len(args) == 1
774                             assert isinstance(args[0], compat_str)
775                             assert isinstance(obj, list)
776                             res = args[0].join(obj)
777                             stack.append(res)
778                         elif mname in method_pyfunctions:
779                             stack.append(method_pyfunctions[mname](args))
780                         else:
781                             raise NotImplementedError(
782                                 u'Unsupported property %r on %r'
783                                 % (mname, obj))
784                     elif opcode == 72:  # returnvalue
785                         res = stack.pop()
786                         return res
787                     elif opcode == 79:  # callpropvoid
788                         index = u30(coder)
789                         mname = multinames[index]
790                         arg_count = u30(coder)
791                         args = list(reversed(
792                             [stack.pop() for _ in range(arg_count)]))
793                         obj = stack.pop()
794                         if mname == u'reverse':
795                             assert isinstance(obj, list)
796                             obj.reverse()
797                         else:
798                             raise NotImplementedError(
799                                 u'Unsupported (void) property %r on %r'
800                                 % (mname, obj))
801                     elif opcode == 93:  # findpropstrict
802                         index = u30(coder)
803                         mname = multinames[index]
804                         res = extract_function(mname)
805                         stack.append(res)
806                     elif opcode == 97:  # setproperty
807                         index = u30(coder)
808                         value = stack.pop()
809                         idx = stack.pop()
810                         obj = stack.pop()
811                         assert isinstance(obj, list)
812                         assert isinstance(idx, int)
813                         obj[idx] = value
814                     elif opcode == 98:  # getlocal
815                         index = u30(coder)
816                         stack.append(registers[index])
817                     elif opcode == 99:  # setlocal
818                         index = u30(coder)
819                         value = stack.pop()
820                         registers[index] = value
821                     elif opcode == 102:  # getproperty
822                         index = u30(coder)
823                         pname = multinames[index]
824                         if pname == u'length':
825                             obj = stack.pop()
826                             assert isinstance(obj, list)
827                             stack.append(len(obj))
828                         else:  # Assume attribute access
829                             idx = stack.pop()
830                             assert isinstance(idx, int)
831                             obj = stack.pop()
832                             assert isinstance(obj, list)
833                             stack.append(obj[idx])
834                     elif opcode == 128:  # coerce
835                         u30(coder)
836                     elif opcode == 133:  # coerce_s
837                         assert isinstance(stack[-1], (type(None), compat_str))
838                     elif opcode == 164:  # modulo
839                         value2 = stack.pop()
840                         value1 = stack.pop()
841                         res = value1 % value2
842                         stack.append(res)
843                     elif opcode == 208:  # getlocal_0
844                         stack.append(registers[0])
845                     elif opcode == 209:  # getlocal_1
846                         stack.append(registers[1])
847                     elif opcode == 210:  # getlocal_2
848                         stack.append(registers[2])
849                     elif opcode == 211:  # getlocal_3
850                         stack.append(registers[3])
851                     elif opcode == 214:  # setlocal_2
852                         registers[2] = stack.pop()
853                     elif opcode == 215:  # setlocal_3
854                         registers[3] = stack.pop()
855                     else:
856                         raise NotImplementedError(
857                             u'Unsupported opcode %d' % opcode)
858
859             method_pyfunctions[func_name] = resfunc
860             return resfunc
861
862         initial_function = extract_function(u'decipher')
863         return lambda s: initial_function([s])
864
865     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
866         """Turn the encrypted s field into a working signature"""
867
868         if player_url is not None:
869             if player_url.startswith(u'//'):
870                 player_url = u'https:' + player_url
871             try:
872                 player_id = (player_url, len(s))
873                 if player_id not in self._player_cache:
874                     func = self._extract_signature_function(
875                         video_id, player_url, len(s)
876                     )
877                     self._player_cache[player_id] = func
878                 func = self._player_cache[player_id]
879                 if self._downloader.params.get('youtube_print_sig_code'):
880                     self._print_sig_code(func, len(s))
881                 return func(s)
882             except Exception as e:
883                 tb = traceback.format_exc()
884                 raise ExtractorError(
885                     u'Automatic signature extraction failed: ' + tb, cause=e)
886
887         return self._static_decrypt_signature(
888             s, video_id, player_url, age_gate)
889
890     def _get_available_subtitles(self, video_id, webpage):
891         try:
892             sub_list = self._download_webpage(
893                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
894                 video_id, note=False)
895         except ExtractorError as err:
896             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
897             return {}
898         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
899
900         sub_lang_list = {}
901         for l in lang_list:
902             lang = l[1]
903             params = compat_urllib_parse.urlencode({
904                 'lang': lang,
905                 'v': video_id,
906                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
907                 'name': unescapeHTML(l[0]).encode('utf-8'),
908             })
909             url = u'https://www.youtube.com/api/timedtext?' + params
910             sub_lang_list[lang] = url
911         if not sub_lang_list:
912             self._downloader.report_warning(u'video doesn\'t have subtitles')
913             return {}
914         return sub_lang_list
915
916     def _get_available_automatic_caption(self, video_id, webpage):
917         """We need the webpage for getting the captions url, pass it as an
918            argument to speed up the process."""
919         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
920         self.to_screen(u'%s: Looking for automatic captions' % video_id)
921         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
922         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
923         if mobj is None:
924             self._downloader.report_warning(err_msg)
925             return {}
926         player_config = json.loads(mobj.group(1))
927         try:
928             args = player_config[u'args']
929             caption_url = args[u'ttsurl']
930             timestamp = args[u'timestamp']
931             # We get the available subtitles
932             list_params = compat_urllib_parse.urlencode({
933                 'type': 'list',
934                 'tlangs': 1,
935                 'asrs': 1,
936             })
937             list_url = caption_url + '&' + list_params
938             caption_list = self._download_xml(list_url, video_id)
939             original_lang_node = caption_list.find('track')
940             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
941                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
942                 return {}
943             original_lang = original_lang_node.attrib['lang_code']
944
945             sub_lang_list = {}
946             for lang_node in caption_list.findall('target'):
947                 sub_lang = lang_node.attrib['lang_code']
948                 params = compat_urllib_parse.urlencode({
949                     'lang': original_lang,
950                     'tlang': sub_lang,
951                     'fmt': sub_format,
952                     'ts': timestamp,
953                     'kind': 'asr',
954                 })
955                 sub_lang_list[sub_lang] = caption_url + '&' + params
956             return sub_lang_list
957         # An extractor error can be raise by the download process if there are
958         # no automatic captions but there are subtitles
959         except (KeyError, ExtractorError):
960             self._downloader.report_warning(err_msg)
961             return {}
962
963     @classmethod
964     def extract_id(cls, url):
965         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
966         if mobj is None:
967             raise ExtractorError(u'Invalid URL: %s' % url)
968         video_id = mobj.group(2)
969         return video_id
970
971     def _extract_from_m3u8(self, manifest_url, video_id):
972         url_map = {}
973         def _get_urls(_manifest):
974             lines = _manifest.split('\n')
975             urls = filter(lambda l: l and not l.startswith('#'),
976                             lines)
977             return urls
978         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
979         formats_urls = _get_urls(manifest)
980         for format_url in formats_urls:
981             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
982             url_map[itag] = format_url
983         return url_map
984
985     def _extract_annotations(self, video_id):
986         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
987         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
988
    def _real_extract(self, url):
        """Extract metadata and the format list for a single YouTube video.

        Handles next_url redirections, age-gated videos (via the
        get_video_info endpoint), encrypted signatures and, optionally,
        the DASH manifest. Returns the standard info dict.
        """
        # Honor --prefer-insecure by building plain-http URLs throughout.
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JS-escaped in the page source; undo the backslash escaping.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one yields a usable token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Normalize '/', ',' and '-' separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # category
        m_cat_container = get_element_by_id("eow-category", video_webpage)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect links by their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Parse a comma-grouped integer from the element with class *klass*.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
                video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            # No usable stream map in the page config; fall back to video_info.
            pass

        def _map_to_format_list(urlmap):
            # Build format dicts from an itag -> URL mapping, merging in the
            # static per-itag metadata from self._formats when known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature; needs deciphering via the player.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Prefer the JS player referenced by the watch page.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    # Replace /s/<sig> path components with decrypted signatures.
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Merge DASH data into an existing format with the same id.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                # DASH is best-effort; the regular formats are already collected.
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id':           video_id,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  upload_date,
            'title':        video_title,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'categories':   video_categories,
            'subtitles':    video_subtitles,
            'duration':     video_duration,
            'age_limit':    18 if age_gate else 0,
            'annotations':  video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count':   view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats':      formats,
        }
1333
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos contained in a YouTube playlist (or a mix)."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots 
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Marker present in the "load more" widget HTML while more pages remain.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Private playlists need an authenticated session; _login() is a
        # no-op when no credentials were supplied.
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into url_result dicts handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element's class name has varied over time; try the
        # known variants from most to least specific.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 1 matches the playlist-page forms, group 2 the bare-id form.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # Both start out as the first page; subsequent AJAX responses
        # replace them separately below.
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        # Title is taken from the first (full HTML) page.
        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1444
1445
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Locate the named top-list playlist on the channel page and extract its videos."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The playlist link on the channel page carries the list title as
        # a url-encoded query fragment; search for that.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        attempt = 0
        while not ids:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1476
1477
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker present in the "load more" widget HTML while more pages remain.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, deduplicated, in first-seen order."""
        ids_in_page = []
        # Track seen ids in a set: membership tests on the list made this
        # quadratic for large channels.
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1532
1533
class YoutubeUserIE(InfoExtractor):
    """Extract all videos uploaded by a YouTube user, via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps results per request; pages are fetched lazily.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match anyway.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding one url-result dict per video on the
            # given zero-based API page; yields nothing past the end.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        # PagedList defers the API requests until the entries are needed.
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1594
1595
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        collected_ids = []
        # The effective cap starts at n and shrinks once the API reports
        # the real total number of items.
        limit = n
        page_idx = 0

        while PAGE_SIZE * page_idx < limit:
            offset = PAGE_SIZE * page_idx + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                offset)
            raw_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_idx + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_idx += 1

        # The last page may overshoot the requested count.
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1637
1638
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same as YoutubeSearchIE, but the API query orders by publish date.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1644
1645
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape a search-results page and return its entries as a url playlist."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each result is rendered inside its own lockup-title header.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is best-effort; the link itself is mandatory.
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1680
1681
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url_result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
1695
1696
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # The '%%s' survives this interpolation as a literal '%s', later
        # filled with the paging value in _real_extract.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-specific, so a login is always attempted.
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # Starts as the int 0; later pages use the numeric string captured
        # from the "load more" link.
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            # The response key has varied between feed types/versions.
            feed_html = info.get('feed_html') or info.get('content_html')
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # No "load more" link means this was the last page.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1741
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed-based extractor: only the feed name/title differ from the base.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1747
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed-based extractor: only the feed name/title differ from the base.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1753
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Personal feed: uses action_load_personal_feed in the AJAX query.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1760
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Personal feed: uses action_load_personal_feed in the AJAX query.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string for consistency with the sibling IEs; the previous u'...'
    # form only worked because '\.' is not a recognized escape sequence
    # (and it warns on newer Pythons). The pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1767
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to its backing playlist and delegate to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the favourites playlist in a link.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1778
1779
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose video id was lost (usually eaten by the shell)."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # An unquoted URL loses everything after the first '&', so the
        # v= parameter never reaches us; explain the quoting fix.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)