_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 import collections
   4 import errno
   5 import io
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import struct
  11 import traceback
  12 import zlib
  13
  14 from .common import InfoExtractor, SearchInfoExtractor
  15 from .subtitles import SubtitlesInfoExtractor
  16 from ..jsinterp import JSInterpreter
  17 from ..utils import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24
  25     clean_html,
  26     get_cachedir,
  27     get_element_by_id,
  28     get_element_by_attribute,
  29     ExtractorError,
  30     int_or_none,
  31     PagedList,
  32     unescapeHTML,
  33     unified_strdate,
  34     orderedSet,
  35     write_json_file,
  36     uppercase_escape,
  37 )
  38
  39 class YoutubeBaseInfoExtractor(InfoExtractor):
  40     """Provide base functions for Youtube extractors"""
  41     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  42     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  43     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  44     _NETRC_MACHINE = 'youtube'
  45     # If True it will raise an error if no login info is provided
  46     _LOGIN_REQUIRED = False
  47
  48     def _set_language(self):
  49         return bool(self._download_webpage(
  50             self._LANG_URL, None,
  51             note=u'Setting language', errnote='unable to set language',
  52             fatal=False))
  53
  54     def _login(self):
  55         (username, password) = self._get_login_info()
  56         # No authentication to be performed
  57         if username is None:
  58             if self._LOGIN_REQUIRED:
  59                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  60             return False
  61
  62         login_page = self._download_webpage(
  63             self._LOGIN_URL, None,
  64             note=u'Downloading login page',
  65             errnote=u'unable to fetch login page', fatal=False)
  66         if login_page is False:
  67             return
  68
  69         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  70                                   login_page, u'Login GALX parameter')
  71
  72         # Log in
  73         login_form_strs = {
  74                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  75                 u'Email': username,
  76                 u'GALX': galx,
  77                 u'Passwd': password,
  78                 u'PersistentCookie': u'yes',
  79                 u'_utf8': u'霱',
  80                 u'bgresponse': u'js_disabled',
  81                 u'checkConnection': u'',
  82                 u'checkedDomains': u'youtube',
  83                 u'dnConn': u'',
  84                 u'pstMsg': u'0',
  85                 u'rmShown': u'1',
  86                 u'secTok': u'',
  87                 u'signIn': u'Sign in',
  88                 u'timeStmp': u'',
  89                 u'service': u'youtube',
  90                 u'uilel': u'3',
  91                 u'hl': u'en_US',
  92         }
  93         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
  94         # chokes on unicode
  95         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
  96         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
  97
  98         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
  99         login_results = self._download_webpage(
 100             req, None,
 101             note=u'Logging in', errnote=u'unable to log in', fatal=False)
 102         if login_results is False:
 103             return False
 104         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 105             self._downloader.report_warning(u'unable to log in: bad username or password')
 106             return False
 107         return True
 108
 109     def _confirm_age(self):
 110         age_form = {
 111             'next_url': '/',
 112             'action_confirm': 'Confirm',
 113         }
 114         req = compat_urllib_request.Request(self._AGE_URL,
 115             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 116
 117         self._download_webpage(
 118             req, None,
 119             note=u'Confirming age', errnote=u'Unable to confirm age')
 120         return True
 121
 122     def _real_initialize(self):
 123         if self._downloader is None:
 124             return
 125         if not self._set_language():
 126             return
 127         if not self._login():
 128             return
 129         self._confirm_age()
 130
 131
 132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 133     IE_DESC = u'YouTube.com'
 134     _VALID_URL = r"""(?x)^
 135                      (
 136                          (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
 137                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 138                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 139                             (?:www\.)?pwnyoutube\.com/|
 140                             (?:www\.)?yourepeat\.com/|
 141                             tube\.majestyc\.net/|
 142                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 143                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 144                          (?:                                                  # the various things that can precede the ID:
 145                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 146                              |(?:                                             # or the v= param in all its forms
 147                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 148                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 149                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 150                                  v=
 151                              )
 152                          ))
 153                          |youtu\.be/                                          # just youtu.be/xxxx
 154                          |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 155                          )
 156                      )?                                                       # all until now is optional -> you can pass the naked ID
 157                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 158                      (?(1).+)?                                                # if we found the ID, everything can follow
 159                      $"""
 160     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 161     _formats = {
 162         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 163         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 164         '13': {'ext': '3gp'},
 165         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 166         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 167         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 168         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 169         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 170         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 171         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 172         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 173         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 174         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 175         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 176         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 177
 178
 179         # 3d videos
 180         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 181         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 182         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 183         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 184         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 185         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 186         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 187
 188         # Apple HTTP Live Streaming
 189         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 190         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 191         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 192         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 193         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 194         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 195         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 196
 197         # DASH mp4 video
 198         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 199         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 200         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 201         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 202         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 203         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 204         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 205         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 206
 207         # Dash mp4 audio
 208         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 209         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 210         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 211
 212         # Dash webm
 213         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 214         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 215         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 216         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 217         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 218         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 219         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 220         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 221         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 222         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 223         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 224         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 225         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 226         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 227         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 228
 229         # Dash webm audio
 230         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
 231         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 232
 233         # RTMP (unnamed)
 234         '_rtmp': {'protocol': 'rtmp'},
 235     }
 236
 237     IE_NAME = u'youtube'
 238     _TESTS = [
 239         {
 240             u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
 241             u"file":  u"BaW_jenozKc.mp4",
 242             u"info_dict": {
 243                 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
 244                 u"uploader": u"Philipp Hagemeister",
 245                 u"uploader_id": u"phihag",
 246                 u"upload_date": u"20121002",
 247                 u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
 248                 u"categories": [u'Science & Technology'],
 249             }
 250         },
 251         {
 252             u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
 253             u"file":  u"UxxajLWwzqY.mp4",
 254             u"note": u"Test generic use_cipher_signature video (#897)",
 255             u"info_dict": {
 256                 u"upload_date": u"20120506",
 257                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
 258                 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
 259                 u"uploader": u"Icona Pop",
 260                 u"uploader_id": u"IconaPop"
 261             }
 262         },
 263         {
 264             u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
 265             u"file":  u"07FYdnEawAQ.mp4",
 266             u"note": u"Test VEVO video with age protection (#956)",
 267             u"info_dict": {
 268                 u"upload_date": u"20130703",
 269                 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
 270                 u"description": u"md5:64249768eec3bc4276236606ea996373",
 271                 u"uploader": u"justintimberlakeVEVO",
 272                 u"uploader_id": u"justintimberlakeVEVO"
 273             }
 274         },
 275         {
 276             u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
 277             u"file":  u"yZIXLfi8CZQ.mp4",
 278             u"note": u"Embed-only video (#1746)",
 279             u"info_dict": {
 280                 u"upload_date": u"20120608",
 281                 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
 282                 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
 283                 u"uploader": u"SET India",
 284                 u"uploader_id": u"setindia"
 285             }
 286         },
 287         {
 288             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 289             u"file": u"a9LDPn-MO4I.m4a",
 290             u"note": u"256k DASH audio (format 141) via DASH manifest",
 291             u"info_dict": {
 292                 u"upload_date": "20121002",
 293                 u"uploader_id": "8KVIDEO",
 294                 u"description": "No description available.",
 295                 u"uploader": "8KVIDEO",
 296                 u"title": "UHDTV TEST 8K VIDEO.mp4"
 297             },
 298             u"params": {
 299                 u"youtube_include_dash_manifest": True,
 300                 u"format": "141",
 301             },
 302         },
 303         # DASH manifest with encrypted signature
 304         {
 305             u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 306             u'info_dict': {
 307                 u'id': u'IB3lcPjvWLA',
 308                 u'ext': u'm4a',
 309                 u'title': u'Afrojack - The Spark ft. Spree Wilson',
 310                 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
 311                 u'uploader': u'AfrojackVEVO',
 312                 u'uploader_id': u'AfrojackVEVO',
 313                 u'upload_date': u'20131011',
 314             },
 315             u"params": {
 316                 u'youtube_include_dash_manifest': True,
 317                 u'format': '141',
 318             },
 319         },
 320     ]
 321
 322
 323     @classmethod
 324     def suitable(cls, url):
 325         """Receives a URL and returns True if suitable for this IE."""
 326         if YoutubePlaylistIE.suitable(url): return False
 327         return re.match(cls._VALID_URL, url) is not None
 328
 329     def __init__(self, *args, **kwargs):
 330         super(YoutubeIE, self).__init__(*args, **kwargs)
 331         self._player_cache = {}
 332
 333     def report_video_info_webpage_download(self, video_id):
 334         """Report attempt to download video info webpage."""
 335         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 336
 337     def report_information_extraction(self, video_id):
 338         """Report attempt to extract video information."""
 339         self.to_screen(u'%s: Extracting video information' % video_id)
 340
    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for the video."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
 344
 345     def report_rtmp_download(self):
 346         """Indicate the download will use the RTMP protocol."""
 347         self.to_screen(u'RTMP download detected')
 348
 349     def _extract_signature_function(self, video_id, player_url, slen):
 350         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
 351                         player_url)
 352         player_type = id_m.group('ext')
 353         player_id = id_m.group('id')
 354
 355         # Read from filesystem cache
 356         func_id = '%s_%s_%d' % (player_type, player_id, slen)
 357         assert os.path.basename(func_id) == func_id
 358         cache_dir = get_cachedir(self._downloader.params)
 359
 360         cache_enabled = cache_dir is not None
 361         if cache_enabled:
 362             cache_fn = os.path.join(os.path.expanduser(cache_dir),
 363                                     u'youtube-sigfuncs',
 364                                     func_id + '.json')
 365             try:
 366                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
 367                     cache_spec = json.load(cachef)
 368                 return lambda s: u''.join(s[i] for i in cache_spec)
 369             except IOError:
 370                 pass  # No cache available
 371
 372         if player_type == 'js':
 373             code = self._download_webpage(
 374                 player_url, video_id,
 375                 note=u'Downloading %s player %s' % (player_type, player_id),
 376                 errnote=u'Download of %s failed' % player_url)
 377             res = self._parse_sig_js(code)
 378         elif player_type == 'swf':
 379             urlh = self._request_webpage(
 380                 player_url, video_id,
 381                 note=u'Downloading %s player %s' % (player_type, player_id),
 382                 errnote=u'Download of %s failed' % player_url)
 383             code = urlh.read()
 384             res = self._parse_sig_swf(code)
 385         else:
 386             assert False, 'Invalid player type %r' % player_type
 387
 388         if cache_enabled:
 389             try:
 390                 test_string = u''.join(map(compat_chr, range(slen)))
 391                 cache_res = res(test_string)
 392                 cache_spec = [ord(c) for c in cache_res]
 393                 try:
 394                     os.makedirs(os.path.dirname(cache_fn))
 395                 except OSError as ose:
 396                     if ose.errno != errno.EEXIST:
 397                         raise
 398                 write_json_file(cache_spec, cache_fn)
 399             except Exception:
 400                 tb = traceback.format_exc()
 401                 self._downloader.report_warning(
 402                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
 403
 404         return res
 405
 406     def _print_sig_code(self, func, slen):
 407         def gen_sig_code(idxs):
 408             def _genslice(start, end, step):
 409                 starts = u'' if start == 0 else str(start)
 410                 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
 411                 steps = u'' if step == 1 else (u':%d' % step)
 412                 return u's[%s%s%s]' % (starts, ends, steps)
 413
 414             step = None
 415             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 416                                     # set as soon as step is set
 417             for i, prev in zip(idxs[1:], idxs[:-1]):
 418                 if step is not None:
 419                     if i - prev == step:
 420                         continue
 421                     yield _genslice(start, prev, step)
 422                     step = None
 423                     continue
 424                 if i - prev in [-1, 1]:
 425                     step = i - prev
 426                     start = prev
 427                     continue
 428                 else:
 429                     yield u's[%d]' % prev
 430             if step is None:
 431                 yield u's[%d]' % i
 432             else:
 433                 yield _genslice(start, i, step)
 434
 435         test_string = u''.join(map(compat_chr, range(slen)))
 436         cache_res = func(test_string)
 437         cache_spec = [ord(c) for c in cache_res]
 438         expr_code = u' + '.join(gen_sig_code(cache_spec))
 439         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
 440         self.to_screen(u'Extracted signature function:\n' + code)
 441
 442     def _parse_sig_js(self, jscode):
 443         funcname = self._search_regex(
 444             r'signature=([$a-zA-Z]+)', jscode,
 445              u'Initial JS player signature function name')
 446
 447         jsi = JSInterpreter(jscode)
 448         initial_function = jsi.extract_function(funcname)
 449         return lambda s: initial_function([s])
 450
 451     def _parse_sig_swf(self, file_contents):
 452         if file_contents[1:3] != b'WS':
 453             raise ExtractorError(
 454                 u'Not an SWF file; header is %r' % file_contents[:3])
 455         if file_contents[:1] == b'C':
 456             content = zlib.decompress(file_contents[8:])
 457         else:
 458             raise NotImplementedError(u'Unsupported compression format %r' %
 459                                       file_contents[:1])
 460
 461         def extract_tags(content):
 462             pos = 0
 463             while pos < len(content):
 464                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
 465                 pos += 2
 466                 tag_code = header16 >> 6
 467                 tag_len = header16 & 0x3f
 468                 if tag_len == 0x3f:
 469                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
 470                     pos += 4
 471                 assert pos+tag_len <= len(content)
 472                 yield (tag_code, content[pos:pos+tag_len])
 473                 pos += tag_len
 474
 475         code_tag = next(tag
 476                         for tag_code, tag in extract_tags(content)
 477                         if tag_code == 82)
 478         p = code_tag.index(b'\0', 4) + 1
 479         code_reader = io.BytesIO(code_tag[p:])
 480
 481         # Parse ABC (AVM2 ByteCode)
 482         def read_int(reader=None):
 483             if reader is None:
 484                 reader = code_reader
 485             res = 0
 486             shift = 0
 487             for _ in range(5):
 488                 buf = reader.read(1)
 489                 assert len(buf) == 1
 490                 b = struct.unpack('<B', buf)[0]
 491                 res = res | ((b & 0x7f) << shift)
 492                 if b & 0x80 == 0:
 493                     break
 494                 shift += 7
 495             return res
 496
 497         def u30(reader=None):
 498             res = read_int(reader)
 499             assert res & 0xf0000000 == 0
 500             return res
 501         u32 = read_int
 502
 503         def s32(reader=None):
 504             v = read_int(reader)
 505             if v & 0x80000000 != 0:
 506                 v = - ((v ^ 0xffffffff) + 1)
 507             return v
 508
 509         def read_string(reader=None):
 510             if reader is None:
 511                 reader = code_reader
 512             slen = u30(reader)
 513             resb = reader.read(slen)
 514             assert len(resb) == slen
 515             return resb.decode('utf-8')
 516
 517         def read_bytes(count, reader=None):
 518             if reader is None:
 519                 reader = code_reader
 520             resb = reader.read(count)
 521             assert len(resb) == count
 522             return resb
 523
 524         def read_byte(reader=None):
 525             resb = read_bytes(1, reader=reader)
 526             res = struct.unpack('<B', resb)[0]
 527             return res
 528
 529         # minor_version + major_version
 530         read_bytes(2 + 2)
 531
 532         # Constant pool
 533         int_count = u30()
 534         for _c in range(1, int_count):
 535             s32()
 536         uint_count = u30()
 537         for _c in range(1, uint_count):
 538             u32()
 539         double_count = u30()
 540         read_bytes((double_count-1) * 8)
 541         string_count = u30()
 542         constant_strings = [u'']
 543         for _c in range(1, string_count):
 544             s = read_string()
 545             constant_strings.append(s)
 546         namespace_count = u30()
 547         for _c in range(1, namespace_count):
 548             read_bytes(1)  # kind
 549             u30()  # name
 550         ns_set_count = u30()
 551         for _c in range(1, ns_set_count):
 552             count = u30()
 553             for _c2 in range(count):
 554                 u30()
 555         multiname_count = u30()
 556         MULTINAME_SIZES = {
 557             0x07: 2,  # QName
 558             0x0d: 2,  # QNameA
 559             0x0f: 1,  # RTQName
 560             0x10: 1,  # RTQNameA
 561             0x11: 0,  # RTQNameL
 562             0x12: 0,  # RTQNameLA
 563             0x09: 2,  # Multiname
 564             0x0e: 2,  # MultinameA
 565             0x1b: 1,  # MultinameL
 566             0x1c: 1,  # MultinameLA
 567         }
 568         multinames = [u'']
 569         for _c in range(1, multiname_count):
 570             kind = u30()
 571             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
 572             if kind == 0x07:
 573                 u30()  # namespace_idx
 574                 name_idx = u30()
 575                 multinames.append(constant_strings[name_idx])
 576             else:
 577                 multinames.append('[MULTINAME kind: %d]' % kind)
 578                 for _c2 in range(MULTINAME_SIZES[kind]):
 579                     u30()
 580
 581         # Methods
 582         method_count = u30()
 583         MethodInfo = collections.namedtuple(
 584             'MethodInfo',
 585             ['NEED_ARGUMENTS', 'NEED_REST'])
 586         method_infos = []
 587         for method_id in range(method_count):
 588             param_count = u30()
 589             u30()  # return type
 590             for _ in range(param_count):
 591                 u30()  # param type
 592             u30()  # name index (always 0 for youtube)
 593             flags = read_byte()
 594             if flags & 0x08 != 0:
 595                 # Options present
 596                 option_count = u30()
 597                 for c in range(option_count):
 598                     u30()  # val
 599                     read_bytes(1)  # kind
 600             if flags & 0x80 != 0:
 601                 # Param names present
 602                 for _ in range(param_count):
 603                     u30()  # param name
 604             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
 605             method_infos.append(mi)
 606
 607         # Metadata
 608         metadata_count = u30()
 609         for _c in range(metadata_count):
 610             u30()  # name
 611             item_count = u30()
 612             for _c2 in range(item_count):
 613                 u30()  # key
 614                 u30()  # value
 615
 616         def parse_traits_info():
 617             trait_name_idx = u30()
 618             kind_full = read_byte()
 619             kind = kind_full & 0x0f
 620             attrs = kind_full >> 4
 621             methods = {}
 622             if kind in [0x00, 0x06]:  # Slot or Const
 623                 u30()  # Slot id
 624                 u30()  # type_name_idx
 625                 vindex = u30()
 626                 if vindex != 0:
 627                     read_byte()  # vkind
 628             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
 629                 u30()  # disp_id
 630                 method_idx = u30()
 631                 methods[multinames[trait_name_idx]] = method_idx
 632             elif kind == 0x04:  # Class
 633                 u30()  # slot_id
 634                 u30()  # classi
 635             elif kind == 0x05:  # Function
 636                 u30()  # slot_id
 637                 function_idx = u30()
 638                 methods[function_idx] = multinames[trait_name_idx]
 639             else:
 640                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
 641
 642             if attrs & 0x4 != 0:  # Metadata present
 643                 metadata_count = u30()
 644                 for _c3 in range(metadata_count):
 645                     u30()  # metadata index
 646
 647             return methods
 648
 649         # Classes
 650         TARGET_CLASSNAME = u'SignatureDecipher'
 651         searched_idx = multinames.index(TARGET_CLASSNAME)
 652         searched_class_id = None
 653         class_count = u30()
 654         for class_id in range(class_count):
 655             name_idx = u30()
 656             if name_idx == searched_idx:
 657                 # We found the class we're looking for!
 658                 searched_class_id = class_id
 659             u30()  # super_name idx
 660             flags = read_byte()
 661             if flags & 0x08 != 0:  # Protected namespace is present
 662                 u30()  # protected_ns_idx
 663             intrf_count = u30()
 664             for _c2 in range(intrf_count):
 665                 u30()
 666             u30()  # iinit
 667             trait_count = u30()
 668             for _c2 in range(trait_count):
 669                 parse_traits_info()
 670
 671         if searched_class_id is None:
 672             raise ExtractorError(u'Target class %r not found' %
 673                                  TARGET_CLASSNAME)
 674
 675         method_names = {}
 676         method_idxs = {}
 677         for class_id in range(class_count):
 678             u30()  # cinit
 679             trait_count = u30()
 680             for _c2 in range(trait_count):
 681                 trait_methods = parse_traits_info()
 682                 if class_id == searched_class_id:
 683                     method_names.update(trait_methods.items())
 684                     method_idxs.update(dict(
 685                         (idx, name)
 686                         for name, idx in trait_methods.items()))
 687
 688         # Scripts
 689         script_count = u30()
 690         for _c in range(script_count):
 691             u30()  # init
 692             trait_count = u30()
 693             for _c2 in range(trait_count):
 694                 parse_traits_info()
 695
 696         # Method bodies
 697         method_body_count = u30()
 698         Method = collections.namedtuple('Method', ['code', 'local_count'])
 699         methods = {}
 700         for _c in range(method_body_count):
 701             method_idx = u30()
 702             u30()  # max_stack
 703             local_count = u30()
 704             u30()  # init_scope_depth
 705             u30()  # max_scope_depth
 706             code_length = u30()
 707             code = read_bytes(code_length)
 708             if method_idx in method_idxs:
 709                 m = Method(code, local_count)
 710                 methods[method_idxs[method_idx]] = m
 711             exception_count = u30()
 712             for _c2 in range(exception_count):
 713                 u30()  # from
 714                 u30()  # to
 715                 u30()  # target
 716                 u30()  # exc_type
 717                 u30()  # var_name
 718             trait_count = u30()
 719             for _c2 in range(trait_count):
 720                 parse_traits_info()
 721
 722         assert p + code_reader.tell() == len(code_tag)
 723         assert len(methods) == len(method_idxs)
 724
 725         method_pyfunctions = {}
 726
 727         def extract_function(func_name):
 728             if func_name in method_pyfunctions:
 729                 return method_pyfunctions[func_name]
 730             if func_name not in methods:
 731                 raise ExtractorError(u'Cannot find function %r' % func_name)
 732             m = methods[func_name]
 733
 734             def resfunc(args):
 735                 registers = ['(this)'] + list(args) + [None] * m.local_count
 736                 stack = []
 737                 coder = io.BytesIO(m.code)
 738                 while True:
 739                     opcode = struct.unpack('!B', coder.read(1))[0]
 740                     if opcode == 36:  # pushbyte
 741                         v = struct.unpack('!B', coder.read(1))[0]
 742                         stack.append(v)
 743                     elif opcode == 44:  # pushstring
 744                         idx = u30(coder)
 745                         stack.append(constant_strings[idx])
 746                     elif opcode == 48:  # pushscope
 747                         # We don't implement the scope register, so we'll just
 748                         # ignore the popped value
 749                         stack.pop()
 750                     elif opcode == 70:  # callproperty
 751                         index = u30(coder)
 752                         mname = multinames[index]
 753                         arg_count = u30(coder)
 754                         args = list(reversed(
 755                             [stack.pop() for _ in range(arg_count)]))
 756                         obj = stack.pop()
 757                         if mname == u'split':
 758                             assert len(args) == 1
 759                             assert isinstance(args[0], compat_str)
 760                             assert isinstance(obj, compat_str)
 761                             if args[0] == u'':
 762                                 res = list(obj)
 763                             else:
 764                                 res = obj.split(args[0])
 765                             stack.append(res)
 766                         elif mname == u'slice':
 767                             assert len(args) == 1
 768                             assert isinstance(args[0], int)
 769                             assert isinstance(obj, list)
 770                             res = obj[args[0]:]
 771                             stack.append(res)
 772                         elif mname == u'join':
 773                             assert len(args) == 1
 774                             assert isinstance(args[0], compat_str)
 775                             assert isinstance(obj, list)
 776                             res = args[0].join(obj)
 777                             stack.append(res)
 778                         elif mname in method_pyfunctions:
 779                             stack.append(method_pyfunctions[mname](args))
 780                         else:
 781                             raise NotImplementedError(
 782                                 u'Unsupported property %r on %r'
 783                                 % (mname, obj))
 784                     elif opcode == 72:  # returnvalue
 785                         res = stack.pop()
 786                         return res
 787                     elif opcode == 79:  # callpropvoid
 788                         index = u30(coder)
 789                         mname = multinames[index]
 790                         arg_count = u30(coder)
 791                         args = list(reversed(
 792                             [stack.pop() for _ in range(arg_count)]))
 793                         obj = stack.pop()
 794                         if mname == u'reverse':
 795                             assert isinstance(obj, list)
 796                             obj.reverse()
 797                         else:
 798                             raise NotImplementedError(
 799                                 u'Unsupported (void) property %r on %r'
 800                                 % (mname, obj))
 801                     elif opcode == 93:  # findpropstrict
 802                         index = u30(coder)
 803                         mname = multinames[index]
 804                         res = extract_function(mname)
 805                         stack.append(res)
 806                     elif opcode == 97:  # setproperty
 807                         index = u30(coder)
 808                         value = stack.pop()
 809                         idx = stack.pop()
 810                         obj = stack.pop()
 811                         assert isinstance(obj, list)
 812                         assert isinstance(idx, int)
 813                         obj[idx] = value
 814                     elif opcode == 98:  # getlocal
 815                         index = u30(coder)
 816                         stack.append(registers[index])
 817                     elif opcode == 99:  # setlocal
 818                         index = u30(coder)
 819                         value = stack.pop()
 820                         registers[index] = value
 821                     elif opcode == 102:  # getproperty
 822                         index = u30(coder)
 823                         pname = multinames[index]
 824                         if pname == u'length':
 825                             obj = stack.pop()
 826                             assert isinstance(obj, list)
 827                             stack.append(len(obj))
 828                         else:  # Assume attribute access
 829                             idx = stack.pop()
 830                             assert isinstance(idx, int)
 831                             obj = stack.pop()
 832                             assert isinstance(obj, list)
 833                             stack.append(obj[idx])
 834                     elif opcode == 128:  # coerce
 835                         u30(coder)
 836                     elif opcode == 133:  # coerce_s
 837                         assert isinstance(stack[-1], (type(None), compat_str))
 838                     elif opcode == 164:  # modulo
 839                         value2 = stack.pop()
 840                         value1 = stack.pop()
 841                         res = value1 % value2
 842                         stack.append(res)
 843                     elif opcode == 208:  # getlocal_0
 844                         stack.append(registers[0])
 845                     elif opcode == 209:  # getlocal_1
 846                         stack.append(registers[1])
 847                     elif opcode == 210:  # getlocal_2
 848                         stack.append(registers[2])
 849                     elif opcode == 211:  # getlocal_3
 850                         stack.append(registers[3])
 851                     elif opcode == 214:  # setlocal_2
 852                         registers[2] = stack.pop()
 853                     elif opcode == 215:  # setlocal_3
 854                         registers[3] = stack.pop()
 855                     else:
 856                         raise NotImplementedError(
 857                             u'Unsupported opcode %d' % opcode)
 858
 859             method_pyfunctions[func_name] = resfunc
 860             return resfunc
 861
 862         initial_function = extract_function(u'decipher')
 863         return lambda s: initial_function([s])
 864
 865     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 866         """Turn the encrypted s field into a working signature"""
 867
 868         if player_url is None:
 869             raise ExtractorError(u'Cannot decrypt signature without player_url')
 870
 871         if player_url.startswith(u'//'):
 872             player_url = u'https:' + player_url
 873         try:
 874             player_id = (player_url, len(s))
 875             if player_id not in self._player_cache:
 876                 func = self._extract_signature_function(
 877                     video_id, player_url, len(s)
 878                 )
 879                 self._player_cache[player_id] = func
 880             func = self._player_cache[player_id]
 881             if self._downloader.params.get('youtube_print_sig_code'):
 882                 self._print_sig_code(func, len(s))
 883             return func(s)
 884         except Exception as e:
 885             tb = traceback.format_exc()
 886             raise ExtractorError(
 887                 u'Automatic signature extraction failed: ' + tb, cause=e)
 888
 889     def _get_available_subtitles(self, video_id, webpage):
 890         try:
 891             sub_list = self._download_webpage(
 892                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 893                 video_id, note=False)
 894         except ExtractorError as err:
 895             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 896             return {}
 897         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 898
 899         sub_lang_list = {}
 900         for l in lang_list:
 901             lang = l[1]
 902             params = compat_urllib_parse.urlencode({
 903                 'lang': lang,
 904                 'v': video_id,
 905                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 906                 'name': unescapeHTML(l[0]).encode('utf-8'),
 907             })
 908             url = u'https://www.youtube.com/api/timedtext?' + params
 909             sub_lang_list[lang] = url
 910         if not sub_lang_list:
 911             self._downloader.report_warning(u'video doesn\'t have subtitles')
 912             return {}
 913         return sub_lang_list
 914
 915     def _get_available_automatic_caption(self, video_id, webpage):
 916         """We need the webpage for getting the captions url, pass it as an
 917            argument to speed up the process."""
 918         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 919         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 920         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 921         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
 922         if mobj is None:
 923             self._downloader.report_warning(err_msg)
 924             return {}
 925         player_config = json.loads(mobj.group(1))
 926         try:
 927             args = player_config[u'args']
 928             caption_url = args[u'ttsurl']
 929             timestamp = args[u'timestamp']
 930             # We get the available subtitles
 931             list_params = compat_urllib_parse.urlencode({
 932                 'type': 'list',
 933                 'tlangs': 1,
 934                 'asrs': 1,
 935             })
 936             list_url = caption_url + '&' + list_params
 937             caption_list = self._download_xml(list_url, video_id)
 938             original_lang_node = caption_list.find('track')
 939             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 940                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
 941                 return {}
 942             original_lang = original_lang_node.attrib['lang_code']
 943
 944             sub_lang_list = {}
 945             for lang_node in caption_list.findall('target'):
 946                 sub_lang = lang_node.attrib['lang_code']
 947                 params = compat_urllib_parse.urlencode({
 948                     'lang': original_lang,
 949                     'tlang': sub_lang,
 950                     'fmt': sub_format,
 951                     'ts': timestamp,
 952                     'kind': 'asr',
 953                 })
 954                 sub_lang_list[sub_lang] = caption_url + '&' + params
 955             return sub_lang_list
 956         # An extractor error can be raise by the download process if there are
 957         # no automatic captions but there are subtitles
 958         except (KeyError, ExtractorError):
 959             self._downloader.report_warning(err_msg)
 960             return {}
 961
 962     @classmethod
 963     def extract_id(cls, url):
 964         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 965         if mobj is None:
 966             raise ExtractorError(u'Invalid URL: %s' % url)
 967         video_id = mobj.group(2)
 968         return video_id
 969
 970     def _extract_from_m3u8(self, manifest_url, video_id):
 971         url_map = {}
 972         def _get_urls(_manifest):
 973             lines = _manifest.split('\n')
 974             urls = filter(lambda l: l and not l.startswith('#'),
 975                             lines)
 976             return urls
 977         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
 978         formats_urls = _get_urls(manifest)
 979         for format_url in formats_urls:
 980             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 981             url_map[itag] = format_url
 982         return url_map
 983
 984     def _extract_annotations(self, video_id):
 985         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 986         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
 987
 988     def _real_extract(self, url):
 989         proto = (
 990             u'http' if self._downloader.params.get('prefer_insecure', False)
 991             else u'https')
 992
 993         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 994         mobj = re.search(self._NEXT_URL_RE, url)
 995         if mobj:
 996             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 997         video_id = self.extract_id(url)
 998
 999         # Get video webpage
1000         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1001         video_webpage = self._download_webpage(url, video_id)
1002
1003         # Attempt to extract SWF player URL
1004         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1005         if mobj is not None:
1006             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1007         else:
1008             player_url = None
1009
1010         # Get video info
1011         self.report_video_info_webpage_download(video_id)
1012         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1013             self.report_age_confirmation()
1014             age_gate = True
1015             # We simulate the access to the video from www.youtube.com/v/{video_id}
1016             # this can be viewed without login into Youtube
1017             data = compat_urllib_parse.urlencode({'video_id': video_id,
1018                                                   'el': 'player_embedded',
1019                                                   'gl': 'US',
1020                                                   'hl': 'en',
1021                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1022                                                   'asv': 3,
1023                                                   'sts':'1588',
1024                                                   })
1025             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1026             video_info_webpage = self._download_webpage(video_info_url, video_id,
1027                                     note=False,
1028                                     errnote='unable to download video info webpage')
1029             video_info = compat_parse_qs(video_info_webpage)
1030         else:
1031             age_gate = False
1032             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1033                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1034                         % (video_id, el_type))
1035                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1036                                         note=False,
1037                                         errnote='unable to download video info webpage')
1038                 video_info = compat_parse_qs(video_info_webpage)
1039                 if 'token' in video_info:
1040                     break
1041         if 'token' not in video_info:
1042             if 'reason' in video_info:
1043                 raise ExtractorError(
1044                     u'YouTube said: %s' % video_info['reason'][0],
1045                     expected=True, video_id=video_id)
1046             else:
1047                 raise ExtractorError(
1048                     u'"token" parameter not in video info for unknown reason',
1049                     video_id=video_id)
1050
1051         if 'view_count' in video_info:
1052             view_count = int(video_info['view_count'][0])
1053         else:
1054             view_count = None
1055
1056         # Check for "rental" videos
1057         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1058             raise ExtractorError(u'"rental" videos not supported')
1059
1060         # Start extracting information
1061         self.report_information_extraction(video_id)
1062
1063         # uploader
1064         if 'author' not in video_info:
1065             raise ExtractorError(u'Unable to extract uploader name')
1066         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1067
1068         # uploader_id
1069         video_uploader_id = None
1070         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1071         if mobj is not None:
1072             video_uploader_id = mobj.group(1)
1073         else:
1074             self._downloader.report_warning(u'unable to extract uploader nickname')
1075
1076         # title
1077         if 'title' in video_info:
1078             video_title = video_info['title'][0]
1079         else:
1080             self._downloader.report_warning(u'Unable to extract video title')
1081             video_title = u'_'
1082
1083         # thumbnail image
1084         # We try first to get a high quality image:
1085         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1086                             video_webpage, re.DOTALL)
1087         if m_thumb is not None:
1088             video_thumbnail = m_thumb.group(1)
1089         elif 'thumbnail_url' not in video_info:
1090             self._downloader.report_warning(u'unable to extract video thumbnail')
1091             video_thumbnail = None
1092         else:   # don't panic if we can't find it
1093             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1094
1095         # upload date
1096         upload_date = None
1097         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
1098         if mobj is None:
1099             mobj = re.search(
1100                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
1101                 video_webpage)
1102         if mobj is not None:
1103             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1104             upload_date = unified_strdate(upload_date)
1105
1106         m_cat_container = get_element_by_id("eow-category", video_webpage)
1107         if m_cat_container:
1108             category = self._html_search_regex(
1109                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1110                 default=None)
1111             video_categories = None if category is None else [category]
1112         else:
1113             video_categories = None
1114
1115         # description
1116         video_description = get_element_by_id("eow-description", video_webpage)
1117         if video_description:
1118             video_description = re.sub(r'''(?x)
1119                 <a\s+
1120                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1121                     title="([^"]+)"\s+
1122                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1123                     class="yt-uix-redirect-link"\s*>
1124                 [^<]+
1125                 </a>
1126             ''', r'\1', video_description)
1127             video_description = clean_html(video_description)
1128         else:
1129             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1130             if fd_mobj:
1131                 video_description = unescapeHTML(fd_mobj.group(1))
1132             else:
1133                 video_description = u''
1134
1135         def _extract_count(klass):
1136             count = self._search_regex(
1137                 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1138                 video_webpage, klass, default=None)
1139             if count is not None:
1140                 return int(count.replace(',', ''))
1141             return None
1142         like_count = _extract_count(u'likes-count')
1143         dislike_count = _extract_count(u'dislikes-count')
1144
1145         # subtitles
1146         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1147
1148         if self._downloader.params.get('listsubtitles', False):
1149             self._list_available_subtitles(video_id, video_webpage)
1150             return
1151
1152         if 'length_seconds' not in video_info:
1153             self._downloader.report_warning(u'unable to extract video duration')
1154             video_duration = None
1155         else:
1156             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1157
1158         # annotations
1159         video_annotations = None
1160         if self._downloader.params.get('writeannotations', False):
1161                 video_annotations = self._extract_annotations(video_id)
1162
1163         # Decide which formats to download
1164         try:
1165             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1166             if not mobj:
1167                 raise ValueError('Could not find vevo ID')
1168             json_code = uppercase_escape(mobj.group(1))
1169             ytplayer_config = json.loads(json_code)
1170             args = ytplayer_config['args']
1171             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1172             # this signatures are encrypted
1173             if 'url_encoded_fmt_stream_map' not in args:
1174                 raise ValueError(u'No stream_map present')  # caught below
1175             re_signature = re.compile(r'[&,]s=')
1176             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1177             if m_s is not None:
1178                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1179                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1180             m_s = re_signature.search(args.get('adaptive_fmts', u''))
1181             if m_s is not None:
1182                 if 'adaptive_fmts' in video_info:
1183                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1184                 else:
1185                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1186         except ValueError:
1187             pass
1188
1189         def _map_to_format_list(urlmap):
1190             formats = []
1191             for itag, video_real_url in urlmap.items():
1192                 dct = {
1193                     'format_id': itag,
1194                     'url': video_real_url,
1195                     'player_url': player_url,
1196                 }
1197                 if itag in self._formats:
1198                     dct.update(self._formats[itag])
1199                 formats.append(dct)
1200             return formats
1201
1202         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1203             self.report_rtmp_download()
1204             formats = [{
1205                 'format_id': '_rtmp',
1206                 'protocol': 'rtmp',
1207                 'url': video_info['conn'][0],
1208                 'player_url': player_url,
1209             }]
1210         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1211             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1212             if 'rtmpe%3Dyes' in encoded_url_map:
1213                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1214             url_map = {}
1215             for url_data_str in encoded_url_map.split(','):
1216                 url_data = compat_parse_qs(url_data_str)
1217                 if 'itag' in url_data and 'url' in url_data:
1218                     url = url_data['url'][0]
1219                     if 'sig' in url_data:
1220                         url += '&signature=' + url_data['sig'][0]
1221                     elif 's' in url_data:
1222                         encrypted_sig = url_data['s'][0]
1223                         if self._downloader.params.get('verbose'):
1224                             if age_gate:
1225                                 if player_url is None:
1226                                     player_version = 'unknown'
1227                                 else:
1228                                     player_version = self._search_regex(
1229                                         r'-(.+)\.swf$', player_url,
1230                                         u'flash player', fatal=False)
1231                                 player_desc = 'flash player %s' % player_version
1232                             else:
1233                                 player_version = self._search_regex(
1234                                     r'html5player-(.+?)\.js', video_webpage,
1235                                     'html5 player', fatal=False)
1236                                 player_desc = u'html5 player %s' % player_version
1237
1238                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1239                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1240                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1241
1242                         if not age_gate:
1243                             jsplayer_url_json = self._search_regex(
1244                                 r'"assets":.+?"js":\s*("[^"]+")',
1245                                 video_webpage, u'JS player URL')
1246                             player_url = json.loads(jsplayer_url_json)
1247
1248                         signature = self._decrypt_signature(
1249                             encrypted_sig, video_id, player_url, age_gate)
1250                         url += '&signature=' + signature
1251                     if 'ratebypass' not in url:
1252                         url += '&ratebypass=yes'
1253                     url_map[url_data['itag'][0]] = url
1254             formats = _map_to_format_list(url_map)
1255         elif video_info.get('hlsvp'):
1256             manifest_url = video_info['hlsvp'][0]
1257             url_map = self._extract_from_m3u8(manifest_url, video_id)
1258             formats = _map_to_format_list(url_map)
1259         else:
1260             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1261
1262         # Look for the DASH manifest
1263         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
1264             try:
1265                 # The DASH manifest used needs to be the one from the original video_webpage.
1266                 # The one found in get_video_info seems to be using different signatures.
1267                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
1268                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
1269                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
1270                 if age_gate:
1271                     dash_manifest_url = video_info.get('dashmpd')[0]
1272                 else:
1273                     dash_manifest_url = ytplayer_config['args']['dashmpd']
1274                 def decrypt_sig(mobj):
1275                     s = mobj.group(1)
1276                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1277                     return '/signature/%s' % dec_s
1278                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
1279                 dash_doc = self._download_xml(
1280                     dash_manifest_url, video_id,
1281                     note=u'Downloading DASH manifest',
1282                     errnote=u'Could not download DASH manifest')
1283                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1284                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1285                     if url_el is None:
1286                         continue
1287                     format_id = r.attrib['id']
1288                     video_url = url_el.text
1289                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1290                     f = {
1291                         'format_id': format_id,
1292                         'url': video_url,
1293                         'width': int_or_none(r.attrib.get('width')),
1294                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1295                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1296                         'filesize': filesize,
1297                     }
1298                     try:
1299                         existing_format = next(
1300                             fo for fo in formats
1301                             if fo['format_id'] == format_id)
1302                     except StopIteration:
1303                         f.update(self._formats.get(format_id, {}))
1304                         formats.append(f)
1305                     else:
1306                         existing_format.update(f)
1307
1308             except (ExtractorError, KeyError) as e:
1309                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1310
1311         self._sort_formats(formats)
1312
1313         return {
1314             'id':           video_id,
1315             'uploader':     video_uploader,
1316             'uploader_id':  video_uploader_id,
1317             'upload_date':  upload_date,
1318             'title':        video_title,
1319             'thumbnail':    video_thumbnail,
1320             'description':  video_description,
1321             'categories':   video_categories,
1322             'subtitles':    video_subtitles,
1323             'duration':     video_duration,
1324             'age_limit':    18 if age_gate else 0,
1325             'annotations':  video_annotations,
1326             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1327             'view_count':   view_count,
1328             'like_count': like_count,
1329             'dislike_count': dislike_count,
1330             'formats':      formats,
1331         }
1332
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including video-generated mixes."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    # Group 1 captures the id from a full URL, group 2 a bare playlist id
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result targeting the 'Youtube' extractor."""
        results = []
        for video_id in ids:
            results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix (playlist id starting with 'RD').

        The mixes are generated from a single video; the id of the
        playlist is just 'RD' + video_id, so the seed video is recovered
        from the last 11 characters of the playlist id.
        """
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(mix_url, playlist_id, u'Downloading Youtube mix')

        # Try the known title containers in order; the first non-empty one wins
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        unique_ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))

        return self.playlist_result(
            self._ids_to_results(unique_ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for url.

        Raises ExtractorError when the URL does not match, when a top
        list id ('TL' prefix) is given, or when the playlist page reports
        that the playlist does not exist or is private.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query:
            video_id = query['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page)
        if unavailable is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages. The first round
        # reads both the videos and the "load more" link from the initial
        # page; later rounds read them from the JSON continuation.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_link = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1443
1444
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists addressed as yttoplist:{channel}:{list title}."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on re-downloads of the list page when it comes back
    # without any videos; prevents an endless request loop.
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Return a playlist result holding the videos of the requested top list.

        The channel page is searched for a link whose query carries the
        list title; that link is then downloaded and scanned for video ids.

        Raises ExtractorError if the list link cannot be found on the
        channel page, or if the list page still contains no videos after
        _MAX_RETRIES retries.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after a bounded number of tries
        for attempt in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Loop finished without a break: every attempt came back empty
            raise ExtractorError(
                u'Unable to find any videos in top list "%s" after %d retries'
                % (title, self._MAX_RETRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1475
1476
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates, in order of first appearance."""
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # No "load more" widget means this was the last page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1531
1532
class YoutubeUserIE(InfoExtractor):
    """Extractor that lists a user's uploads through the GData feed API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if url matches this extractor and no other *IE class in this module."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match their URLs too.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result of the user's uploads.

        Raises ExtractorError when url does not match _VALID_URL, or (while
        a page is being fetched) when the API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per video on the given feed page."""
            # The API's start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this request (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries yields nothing for this page
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1593
1594
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for YouTube searches, backed by the GData videos feed."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result with at most n videos matching query.

        Pages of 50 results are requested until n ids have been requested
        or the total reported by the API is reached. Raises ExtractorError
        when a response carries no 'items'.
        """
        PAGE_SIZE = 50
        collected_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            # The API's start-index is 1-based
            start_index = PAGE_SIZE * pagenum + 1
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, start_index),
                video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            page_ids = [video['id'] for video in api_response['items']]
            collected_ids.extend(page_ids)

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]

        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1636
1637
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, with the extra orderby parameter
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1643
1644
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?search_query=... pages."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict with one url entry per result on the search page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Restrict parsing to the result list
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        playlist = {
            '_type': 'playlist',
            'entries': [],
            'title': query,
        }
        # Each result carries its link inside a yt-lockup-title heading
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            playlist['entries'].append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return playlist
1679
1680
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube show pages, which link one playlist per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one per season playlist linked from the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for path in season_paths:
            season_results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
1694
1695
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are tied to an account, so login info is mandatory
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; the remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of every page of the feed."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_num)
            # The HTML fragment is delivered under either of two keys
            feed_html = info.get('feed_html') or info.get('content_html')
            video_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1740
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    # Matches the feed URL or the :ytsubs / :ytsubscriptions keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1746
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Matches the feed URL or the :ytrec / :ytrecommended keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1752
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Matches the feed URL or the :ytwatchlater keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1759
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed, loaded as a personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' in a non-raw
    # literal is an invalid escape sequence that newer Pythons warn about.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1766
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result delegating to the YoutubePlaylist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first list= parameter on the page is taken as the playlist id
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1777
1778
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs that lack a video id and reports a helpful error."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise ExtractorError: the URL carries no video id to extract."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)