[youtube] SWF parser: Add opcode 86
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18     compat_chr,
19     compat_parse_qs,
20     compat_urllib_parse,
21     compat_urllib_request,
22     compat_urlparse,
23     compat_str,
24
25     clean_html,
26     get_cachedir,
27     get_element_by_id,
28     get_element_by_attribute,
29     ExtractorError,
30     int_or_none,
31     PagedList,
32     unescapeHTML,
33     unified_strdate,
34     orderedSet,
35     write_json_file,
36     uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the site to English; True if the request succeeded."""
        page = self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False)
        return bool(page)

    def _login(self):
        """Attempt to log in with the configured credentials.

        Returns True on success, False when no credentials are available
        or the login is rejected, and None when the login page itself
        could not be fetched.
        """
        (username, password) = self._get_login_info()
        if username is None:
            # Missing credentials are only fatal when login is mandatory.
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(
            r'(?s)<input.+?name="GALX".+?value="(.+?)"',
            login_page, u'Login GALX parameter')

        # Fields mimicking the Google account sign-in form.
        form_fields = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Encode to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        encoded_form = dict(
            (name.encode('utf-8'), value.encode('utf-8'))
            for name, value in form_fields.items())
        login_data = compat_urllib_parse.urlencode(encoded_form).encode('ascii')

        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            request, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # The login form re-appearing in the response means the login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form. Always returns True."""
        form_data = compat_urllib_parse.urlencode({
            'next_url': '/',
            'action_confirm': 'Confirm',
        }).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, form_data)
        self._download_webpage(
            request, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        # Each preparatory step short-circuits initialization on failure.
        if self._set_language() and self._login():
            self._confirm_age()
130
131
132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose regex matching full watch URLs on the various YouTube (and
    # proxy/mirror) hostnames, embed/shortened forms, and — because the
    # whole host/path part is optional — bare 11-character video IDs.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the next_url query parameter from verify-age redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Map of YouTube itag (as a string key) to known format properties.
    # 'preference' is more negative for the 3D/HLS/DASH-only variants;
    # 'acodec'/'vcodec' set to 'none' mark video-only/audio-only streams.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
236
    IE_NAME = u'youtube'
    # Test cases: each dict pairs a URL with the metadata its extraction is
    # expected to yield (consumed by youtube-dl's download test harness).
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file":  u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
322
323     @classmethod
324     def suitable(cls, url):
325         """Receives a URL and returns True if suitable for this IE."""
326         if YoutubePlaylistIE.suitable(url): return False
327         return re.match(cls._VALID_URL, url) is not None
328
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # In-memory cache of extracted signature functions, so each player
        # version only has to be parsed once per process.
        self._player_cache = {}
332
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
348
349     def _extract_signature_function(self, video_id, player_url, slen):
350         id_m = re.match(
351             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$',
352             player_url)
353         player_type = id_m.group('ext')
354         player_id = id_m.group('id')
355
356         # Read from filesystem cache
357         func_id = '%s_%s_%d' % (player_type, player_id, slen)
358         assert os.path.basename(func_id) == func_id
359         cache_dir = get_cachedir(self._downloader.params)
360
361         cache_enabled = cache_dir is not None
362         if cache_enabled:
363             cache_fn = os.path.join(os.path.expanduser(cache_dir),
364                                     u'youtube-sigfuncs',
365                                     func_id + '.json')
366             try:
367                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
368                     cache_spec = json.load(cachef)
369                 return lambda s: u''.join(s[i] for i in cache_spec)
370             except IOError:
371                 pass  # No cache available
372
373         if player_type == 'js':
374             code = self._download_webpage(
375                 player_url, video_id,
376                 note=u'Downloading %s player %s' % (player_type, player_id),
377                 errnote=u'Download of %s failed' % player_url)
378             res = self._parse_sig_js(code)
379         elif player_type == 'swf':
380             urlh = self._request_webpage(
381                 player_url, video_id,
382                 note=u'Downloading %s player %s' % (player_type, player_id),
383                 errnote=u'Download of %s failed' % player_url)
384             code = urlh.read()
385             res = self._parse_sig_swf(code)
386         else:
387             assert False, 'Invalid player type %r' % player_type
388
389         if cache_enabled:
390             try:
391                 test_string = u''.join(map(compat_chr, range(slen)))
392                 cache_res = res(test_string)
393                 cache_spec = [ord(c) for c in cache_res]
394                 try:
395                     os.makedirs(os.path.dirname(cache_fn))
396                 except OSError as ose:
397                     if ose.errno != errno.EEXIST:
398                         raise
399                 write_json_file(cache_spec, cache_fn)
400             except Exception:
401                 tb = traceback.format_exc()
402                 self._downloader.report_warning(
403                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
404
405         return res
406
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the signature function.

        Runs func over the identity string of length slen to obtain the
        index permutation it performs, then prints it as a compact
        expression of slices and single-character picks — presumably for
        debugging / hard-coding known algorithms (no caller in this
        chunk; confirm against the rest of the file).
        """
        def gen_sig_code(idxs):
            # Yield "s[...]" fragments covering idxs: consecutive runs with
            # step +1/-1 become slices, isolated indices become s[d].
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                # end+step is the exclusive slice bound; a negative bound
                # would wrap, so fall back to an open-ended slice.
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it while the step matches,
                    # otherwise emit the run that just ended.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element / run (NOTE: assumes idxs has at
            # least two entries, otherwise i is unbound here).
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
442
443     def _parse_sig_js(self, jscode):
444         funcname = self._search_regex(
445             r'signature=([$a-zA-Z]+)', jscode,
446              u'Initial JS player signature function name')
447
448         jsi = JSInterpreter(jscode)
449         initial_function = jsi.extract_function(funcname)
450         return lambda s: initial_function([s])
451
452     def _parse_sig_swf(self, file_contents):
453         if file_contents[1:3] != b'WS':
454             raise ExtractorError(
455                 u'Not an SWF file; header is %r' % file_contents[:3])
456         if file_contents[:1] == b'C':
457             content = zlib.decompress(file_contents[8:])
458         else:
459             raise NotImplementedError(u'Unsupported compression format %r' %
460                                       file_contents[:1])
461
462         def extract_tags(content):
463             pos = 0
464             while pos < len(content):
465                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
466                 pos += 2
467                 tag_code = header16 >> 6
468                 tag_len = header16 & 0x3f
469                 if tag_len == 0x3f:
470                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
471                     pos += 4
472                 assert pos+tag_len <= len(content)
473                 yield (tag_code, content[pos:pos+tag_len])
474                 pos += tag_len
475
476         code_tag = next(tag
477                         for tag_code, tag in extract_tags(content)
478                         if tag_code == 82)
479         p = code_tag.index(b'\0', 4) + 1
480         code_reader = io.BytesIO(code_tag[p:])
481
482         # Parse ABC (AVM2 ByteCode)
483         def read_int(reader=None):
484             if reader is None:
485                 reader = code_reader
486             res = 0
487             shift = 0
488             for _ in range(5):
489                 buf = reader.read(1)
490                 assert len(buf) == 1
491                 b = struct.unpack('<B', buf)[0]
492                 res = res | ((b & 0x7f) << shift)
493                 if b & 0x80 == 0:
494                     break
495                 shift += 7
496             return res
497
498         def u30(reader=None):
499             res = read_int(reader)
500             assert res & 0xf0000000 == 0
501             return res
502         u32 = read_int
503
504         def s32(reader=None):
505             v = read_int(reader)
506             if v & 0x80000000 != 0:
507                 v = - ((v ^ 0xffffffff) + 1)
508             return v
509
510         def read_string(reader=None):
511             if reader is None:
512                 reader = code_reader
513             slen = u30(reader)
514             resb = reader.read(slen)
515             assert len(resb) == slen
516             return resb.decode('utf-8')
517
518         def read_bytes(count, reader=None):
519             if reader is None:
520                 reader = code_reader
521             resb = reader.read(count)
522             assert len(resb) == count
523             return resb
524
525         def read_byte(reader=None):
526             resb = read_bytes(1, reader=reader)
527             res = struct.unpack('<B', resb)[0]
528             return res
529
530         # minor_version + major_version
531         read_bytes(2 + 2)
532
533         # Constant pool
534         int_count = u30()
535         for _c in range(1, int_count):
536             s32()
537         uint_count = u30()
538         for _c in range(1, uint_count):
539             u32()
540         double_count = u30()
541         read_bytes((double_count-1) * 8)
542         string_count = u30()
543         constant_strings = [u'']
544         for _c in range(1, string_count):
545             s = read_string()
546             constant_strings.append(s)
547         namespace_count = u30()
548         for _c in range(1, namespace_count):
549             read_bytes(1)  # kind
550             u30()  # name
551         ns_set_count = u30()
552         for _c in range(1, ns_set_count):
553             count = u30()
554             for _c2 in range(count):
555                 u30()
556         multiname_count = u30()
557         MULTINAME_SIZES = {
558             0x07: 2,  # QName
559             0x0d: 2,  # QNameA
560             0x0f: 1,  # RTQName
561             0x10: 1,  # RTQNameA
562             0x11: 0,  # RTQNameL
563             0x12: 0,  # RTQNameLA
564             0x09: 2,  # Multiname
565             0x0e: 2,  # MultinameA
566             0x1b: 1,  # MultinameL
567             0x1c: 1,  # MultinameLA
568         }
569         multinames = [u'']
570         for _c in range(1, multiname_count):
571             kind = u30()
572             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
573             if kind == 0x07:
574                 u30()  # namespace_idx
575                 name_idx = u30()
576                 multinames.append(constant_strings[name_idx])
577             else:
578                 multinames.append('[MULTINAME kind: %d]' % kind)
579                 for _c2 in range(MULTINAME_SIZES[kind]):
580                     u30()
581
582         # Methods
583         method_count = u30()
584         MethodInfo = collections.namedtuple(
585             'MethodInfo',
586             ['NEED_ARGUMENTS', 'NEED_REST'])
587         method_infos = []
588         for method_id in range(method_count):
589             param_count = u30()
590             u30()  # return type
591             for _ in range(param_count):
592                 u30()  # param type
593             u30()  # name index (always 0 for youtube)
594             flags = read_byte()
595             if flags & 0x08 != 0:
596                 # Options present
597                 option_count = u30()
598                 for c in range(option_count):
599                     u30()  # val
600                     read_bytes(1)  # kind
601             if flags & 0x80 != 0:
602                 # Param names present
603                 for _ in range(param_count):
604                     u30()  # param name
605             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
606             method_infos.append(mi)
607
608         # Metadata
609         metadata_count = u30()
610         for _c in range(metadata_count):
611             u30()  # name
612             item_count = u30()
613             for _c2 in range(item_count):
614                 u30()  # key
615                 u30()  # value
616
617         def parse_traits_info():
618             trait_name_idx = u30()
619             kind_full = read_byte()
620             kind = kind_full & 0x0f
621             attrs = kind_full >> 4
622             methods = {}
623             if kind in [0x00, 0x06]:  # Slot or Const
624                 u30()  # Slot id
625                 u30()  # type_name_idx
626                 vindex = u30()
627                 if vindex != 0:
628                     read_byte()  # vkind
629             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
630                 u30()  # disp_id
631                 method_idx = u30()
632                 methods[multinames[trait_name_idx]] = method_idx
633             elif kind == 0x04:  # Class
634                 u30()  # slot_id
635                 u30()  # classi
636             elif kind == 0x05:  # Function
637                 u30()  # slot_id
638                 function_idx = u30()
639                 methods[function_idx] = multinames[trait_name_idx]
640             else:
641                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
642
643             if attrs & 0x4 != 0:  # Metadata present
644                 metadata_count = u30()
645                 for _c3 in range(metadata_count):
646                     u30()  # metadata index
647
648             return methods
649
650         # Classes
651         TARGET_CLASSNAME = u'SignatureDecipher'
652         searched_idx = multinames.index(TARGET_CLASSNAME)
653         searched_class_id = None
654         class_count = u30()
655         for class_id in range(class_count):
656             name_idx = u30()
657             if name_idx == searched_idx:
658                 # We found the class we're looking for!
659                 searched_class_id = class_id
660             u30()  # super_name idx
661             flags = read_byte()
662             if flags & 0x08 != 0:  # Protected namespace is present
663                 u30()  # protected_ns_idx
664             intrf_count = u30()
665             for _c2 in range(intrf_count):
666                 u30()
667             u30()  # iinit
668             trait_count = u30()
669             for _c2 in range(trait_count):
670                 parse_traits_info()
671
672         if searched_class_id is None:
673             raise ExtractorError(u'Target class %r not found' %
674                                  TARGET_CLASSNAME)
675
676         method_names = {}
677         method_idxs = {}
678         for class_id in range(class_count):
679             u30()  # cinit
680             trait_count = u30()
681             for _c2 in range(trait_count):
682                 trait_methods = parse_traits_info()
683                 if class_id == searched_class_id:
684                     method_names.update(trait_methods.items())
685                     method_idxs.update(dict(
686                         (idx, name)
687                         for name, idx in trait_methods.items()))
688
689         # Scripts
690         script_count = u30()
691         for _c in range(script_count):
692             u30()  # init
693             trait_count = u30()
694             for _c2 in range(trait_count):
695                 parse_traits_info()
696
697         # Method bodies
698         method_body_count = u30()
699         Method = collections.namedtuple('Method', ['code', 'local_count'])
700         methods = {}
701         for _c in range(method_body_count):
702             method_idx = u30()
703             u30()  # max_stack
704             local_count = u30()
705             u30()  # init_scope_depth
706             u30()  # max_scope_depth
707             code_length = u30()
708             code = read_bytes(code_length)
709             if method_idx in method_idxs:
710                 m = Method(code, local_count)
711                 methods[method_idxs[method_idx]] = m
712             exception_count = u30()
713             for _c2 in range(exception_count):
714                 u30()  # from
715                 u30()  # to
716                 u30()  # target
717                 u30()  # exc_type
718                 u30()  # var_name
719             trait_count = u30()
720             for _c2 in range(trait_count):
721                 parse_traits_info()
722
723         assert p + code_reader.tell() == len(code_tag)
724         assert len(methods) == len(method_idxs)
725
726         method_pyfunctions = {}
727
728         def extract_function(func_name):
729             if func_name in method_pyfunctions:
730                 return method_pyfunctions[func_name]
731             if func_name not in methods:
732                 raise ExtractorError(u'Cannot find function %r' % func_name)
733             m = methods[func_name]
734
735             def resfunc(args):
736                 registers = ['(this)'] + list(args) + [None] * m.local_count
737                 stack = []
738                 coder = io.BytesIO(m.code)
739                 while True:
740                     opcode = struct.unpack('!B', coder.read(1))[0]
741                     if opcode == 36:  # pushbyte
742                         v = struct.unpack('!B', coder.read(1))[0]
743                         stack.append(v)
744                     elif opcode == 44:  # pushstring
745                         idx = u30(coder)
746                         stack.append(constant_strings[idx])
747                     elif opcode == 48:  # pushscope
748                         # We don't implement the scope register, so we'll just
749                         # ignore the popped value
750                         stack.pop()
751                     elif opcode == 70:  # callproperty
752                         index = u30(coder)
753                         mname = multinames[index]
754                         arg_count = u30(coder)
755                         args = list(reversed(
756                             [stack.pop() for _ in range(arg_count)]))
757                         obj = stack.pop()
758                         if mname == u'split':
759                             assert len(args) == 1
760                             assert isinstance(args[0], compat_str)
761                             assert isinstance(obj, compat_str)
762                             if args[0] == u'':
763                                 res = list(obj)
764                             else:
765                                 res = obj.split(args[0])
766                             stack.append(res)
767                         elif mname == u'slice':
768                             assert len(args) == 1
769                             assert isinstance(args[0], int)
770                             assert isinstance(obj, list)
771                             res = obj[args[0]:]
772                             stack.append(res)
773                         elif mname == u'join':
774                             assert len(args) == 1
775                             assert isinstance(args[0], compat_str)
776                             assert isinstance(obj, list)
777                             res = args[0].join(obj)
778                             stack.append(res)
779                         elif mname in method_pyfunctions:
780                             stack.append(method_pyfunctions[mname](args))
781                         else:
782                             raise NotImplementedError(
783                                 u'Unsupported property %r on %r'
784                                 % (mname, obj))
785                     elif opcode == 72:  # returnvalue
786                         res = stack.pop()
787                         return res
788                     elif opcode == 79:  # callpropvoid
789                         index = u30(coder)
790                         mname = multinames[index]
791                         arg_count = u30(coder)
792                         args = list(reversed(
793                             [stack.pop() for _ in range(arg_count)]))
794                         obj = stack.pop()
795                         if mname == u'reverse':
796                             assert isinstance(obj, list)
797                             obj.reverse()
798                         else:
799                             raise NotImplementedError(
800                                 u'Unsupported (void) property %r on %r'
801                                 % (mname, obj))
802                     elif opcode == 86:  # newarray
803                         arg_count = u30(coder)
804                         arr = []
805                         for i in range(arg_count):
806                             arr.append(stack.pop())
807                         arr = arr[::-1]
808                         stack.append(arr)
809                     elif opcode == 93:  # findpropstrict
810                         index = u30(coder)
811                         mname = multinames[index]
812                         res = extract_function(mname)
813                         stack.append(res)
814                     elif opcode == 97:  # setproperty
815                         index = u30(coder)
816                         value = stack.pop()
817                         idx = stack.pop()
818                         obj = stack.pop()
819                         assert isinstance(obj, list)
820                         assert isinstance(idx, int)
821                         obj[idx] = value
822                     elif opcode == 98:  # getlocal
823                         index = u30(coder)
824                         stack.append(registers[index])
825                     elif opcode == 99:  # setlocal
826                         index = u30(coder)
827                         value = stack.pop()
828                         registers[index] = value
829                     elif opcode == 102:  # getproperty
830                         index = u30(coder)
831                         pname = multinames[index]
832                         if pname == u'length':
833                             obj = stack.pop()
834                             assert isinstance(obj, list)
835                             stack.append(len(obj))
836                         else:  # Assume attribute access
837                             idx = stack.pop()
838                             assert isinstance(idx, int)
839                             obj = stack.pop()
840                             assert isinstance(obj, list)
841                             stack.append(obj[idx])
842                     elif opcode == 128:  # coerce
843                         u30(coder)
844                     elif opcode == 133:  # coerce_s
845                         assert isinstance(stack[-1], (type(None), compat_str))
846                     elif opcode == 164:  # modulo
847                         value2 = stack.pop()
848                         value1 = stack.pop()
849                         res = value1 % value2
850                         stack.append(res)
851                     elif opcode == 208:  # getlocal_0
852                         stack.append(registers[0])
853                     elif opcode == 209:  # getlocal_1
854                         stack.append(registers[1])
855                     elif opcode == 210:  # getlocal_2
856                         stack.append(registers[2])
857                     elif opcode == 211:  # getlocal_3
858                         stack.append(registers[3])
859                     elif opcode == 214:  # setlocal_2
860                         registers[2] = stack.pop()
861                     elif opcode == 215:  # setlocal_3
862                         registers[3] = stack.pop()
863                     else:
864                         raise NotImplementedError(
865                             u'Unsupported opcode %d' % opcode)
866
867             method_pyfunctions[func_name] = resfunc
868             return resfunc
869
870         initial_function = extract_function(u'decipher')
871         return lambda s: initial_function([s])
872
873     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
874         """Turn the encrypted s field into a working signature"""
875
876         if player_url is None:
877             raise ExtractorError(u'Cannot decrypt signature without player_url')
878
879         if player_url.startswith(u'//'):
880             player_url = u'https:' + player_url
881         try:
882             player_id = (player_url, len(s))
883             if player_id not in self._player_cache:
884                 func = self._extract_signature_function(
885                     video_id, player_url, len(s)
886                 )
887                 self._player_cache[player_id] = func
888             func = self._player_cache[player_id]
889             if self._downloader.params.get('youtube_print_sig_code'):
890                 self._print_sig_code(func, len(s))
891             return func(s)
892         except Exception as e:
893             tb = traceback.format_exc()
894             raise ExtractorError(
895                 u'Automatic signature extraction failed: ' + tb, cause=e)
896
897     def _get_available_subtitles(self, video_id, webpage):
898         try:
899             sub_list = self._download_webpage(
900                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
901                 video_id, note=False)
902         except ExtractorError as err:
903             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
904             return {}
905         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
906
907         sub_lang_list = {}
908         for l in lang_list:
909             lang = l[1]
910             params = compat_urllib_parse.urlencode({
911                 'lang': lang,
912                 'v': video_id,
913                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
914                 'name': unescapeHTML(l[0]).encode('utf-8'),
915             })
916             url = u'https://www.youtube.com/api/timedtext?' + params
917             sub_lang_list[lang] = url
918         if not sub_lang_list:
919             self._downloader.report_warning(u'video doesn\'t have subtitles')
920             return {}
921         return sub_lang_list
922
923     def _get_available_automatic_caption(self, video_id, webpage):
924         """We need the webpage for getting the captions url, pass it as an
925            argument to speed up the process."""
926         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
927         self.to_screen(u'%s: Looking for automatic captions' % video_id)
928         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
929         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
930         if mobj is None:
931             self._downloader.report_warning(err_msg)
932             return {}
933         player_config = json.loads(mobj.group(1))
934         try:
935             args = player_config[u'args']
936             caption_url = args[u'ttsurl']
937             timestamp = args[u'timestamp']
938             # We get the available subtitles
939             list_params = compat_urllib_parse.urlencode({
940                 'type': 'list',
941                 'tlangs': 1,
942                 'asrs': 1,
943             })
944             list_url = caption_url + '&' + list_params
945             caption_list = self._download_xml(list_url, video_id)
946             original_lang_node = caption_list.find('track')
947             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
948                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
949                 return {}
950             original_lang = original_lang_node.attrib['lang_code']
951
952             sub_lang_list = {}
953             for lang_node in caption_list.findall('target'):
954                 sub_lang = lang_node.attrib['lang_code']
955                 params = compat_urllib_parse.urlencode({
956                     'lang': original_lang,
957                     'tlang': sub_lang,
958                     'fmt': sub_format,
959                     'ts': timestamp,
960                     'kind': 'asr',
961                 })
962                 sub_lang_list[sub_lang] = caption_url + '&' + params
963             return sub_lang_list
964         # An extractor error can be raise by the download process if there are
965         # no automatic captions but there are subtitles
966         except (KeyError, ExtractorError):
967             self._downloader.report_warning(err_msg)
968             return {}
969
970     @classmethod
971     def extract_id(cls, url):
972         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
973         if mobj is None:
974             raise ExtractorError(u'Invalid URL: %s' % url)
975         video_id = mobj.group(2)
976         return video_id
977
978     def _extract_from_m3u8(self, manifest_url, video_id):
979         url_map = {}
980         def _get_urls(_manifest):
981             lines = _manifest.split('\n')
982             urls = filter(lambda l: l and not l.startswith('#'),
983                             lines)
984             return urls
985         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
986         formats_urls = _get_urls(manifest)
987         for format_url in formats_urls:
988             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
989             url_map[itag] = format_url
990         return url_map
991
992     def _extract_annotations(self, video_id):
993         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
994         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
995
    def _real_extract(self, url):
        """Extract a single YouTube video.

        Flow: resolve redirect-style URLs, download the watch page, fetch
        get_video_info (with an embedded-player fallback for age-gated
        videos), then pull metadata from both sources and build the formats
        list from the stream map, an HLS manifest, or an RTMP conn entry,
        optionally merging in DASH representations.
        """
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        # The URL appears JS-escaped in the page; re.sub strips the backslashes.
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' looks like a hard-coded player timestamp — may
            # need updating when the embedded player changes; confirm upstream.
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Retry get_video_info with different 'el' values until one of
            # them yields a 'token' entry.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # category
        m_cat_container = get_element_by_id("eow-category", video_webpage)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect links with their plain title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # Extract a comma-grouped integer counter (likes/dislikes) by its
        # span class; returns None when the counter is absent.
        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
                video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        # If the webpage's ytplayer.config carries encrypted-signature stream
        # maps, they override/extend the ones from get_video_info; any parse
        # failure falls through to the plain video_info data via ValueError.
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        # Turn an itag->URL map into format dicts, merging in the static
        # per-itag metadata from self._formats where known.
        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Signature already delivered in the clear.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: locate the player and decipher.
                        encrypted_sig = url_data['s'][0]

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)
                        if player_url is None:
                            player_url_json = self._search_regex(
                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                                video_webpage, u'age gate player URL')
                            player_url = json.loads(player_url_json)

                        if self._downloader.params.get('verbose'):
                            if player_url is None:
                                player_version = 'unknown'
                                player_desc = 'unknown'
                            else:
                                if player_url.endswith('swf'):
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                    player_desc = 'flash player %s' % player_version
                                else:
                                    player_version = self._search_regex(
                                        r'html5player-(.+?)\.js', video_webpage,
                                        'html5 player', fatal=False)
                                    player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                # NOTE: ytplayer_config may be unbound here if the config parse
                # above failed; the resulting NameError-like KeyError path is
                # covered by the except clause below only for KeyError.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                # Replace encrypted /s/<sig> path segments with deciphered
                # /signature/<sig> segments.
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    # Merge with an already-known format of the same id,
                    # otherwise append as a new format.
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id':           video_id,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  upload_date,
            'title':        video_title,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'categories':   video_categories,
            'subtitles':    video_subtitles,
            'duration':     video_duration,
            'age_limit':    18 if age_gate else 0,
            'annotations':  video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count':   view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats':      formats,
        }
1347
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a YouTube playlist.

    Matches regular playlist URLs, watch URLs that carry a list parameter
    and bare playlist ids; auto-generated mixes (ids starting with 'RD')
    are handled by a dedicated scraping path (_extract_mix).
    """
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots 
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    # Canonical playlist page; pagination in _real_extract starts from it.
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Captures both the video id and its position within the playlist.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Log in (when credentials are configured) before extraction so
        # playlists visible to the account become accessible.
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix.

        The mixes are generated from a single video;
        the id of the playlist is just 'RD' + video_id.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element's class varies between page versions; try the
        # known variants from most to least specific.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result with every video of the playlist.

        Raises ExtractorError for invalid URLs, top-list ids (which need
        the yttoplist keyword) and private or nonexistent playlists.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the relative URL of the next
            # ajax page; its absence means the last page was reached.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1458
1459
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve a yttoplist:<channel>:<title> pseudo-URL to a playlist."""
        match = re.match(self._VALID_URL, url)
        channel_name = match.group('chann')
        list_title = match.group('title')
        # Locate the playlist link on the channel page through the
        # url-encoded title it carries in its href.
        encoded_title = compat_urllib_parse.urlencode({'title': list_title})
        channel_html = self._download_webpage(
            'https://www.youtube.com/%s' % channel_name, list_title)
        playlist_link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(encoded_title),
            channel_html, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', playlist_link)

        entry_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # Sometimes the webpage doesn't contain the videos;
        # keep re-downloading until at least one id shows up.
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            page = self._download_webpage(url, list_title, note)
            video_ids = orderedSet(re.findall(entry_re, page))
            if video_ids:
                break
            attempt += 1
        return self.playlist_result(
            self._ids_to_results(video_ids), playlist_title=list_title)
1490
1491
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in first-seen order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Return a playlist with every video of the channel."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels list everything on a single page and
        # their ajax endpoints return nothing, so detect them up front.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result(vid, 'Youtube', video_id=vid)
                   for vid in video_ids]
        return self.playlist_result(entries, channel_id)
1546
1547
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is very permissive; defer to every other youtube
        # extractor whose pattern also matches this URL.
        competitors = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in competitors):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily-paged playlist of the user's uploads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # pages are fetched one by one until an empty page comes back.
        def fetch_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            raw = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(raw)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }

        url_results = PagedList(fetch_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1608
1609
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The feed reports the true total; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count.
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1651
1652
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent, but sorted by upload date via the API's
    # orderby=published parameter.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1658
1659
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape a results page and return its items as a playlist."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        # Each result is wrapped in its own lockup-title header.
        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1694
1695
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url_result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [
            self.url_result('https://www.youtube.com' + m.group(1), 'YoutubePlaylist')
            for m in seasons]
1709
1710
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the paged ajax feed and collect every linked video."""
        entries = []
        paging = 0
        for page_no in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_no)
            # Older responses use 'feed_html', newer ones 'content_html'.
            feed_html = info.get('feed_html') or info.get('content_html')
            video_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids)
            next_link = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_link is None:
                break
            paging = next_link.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1755
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed (default action_load_system_feed); needs a logged-in
    # account because the subscription list is per user.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1761
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed (default action_load_system_feed); recommendations are
    # tied to the authenticated account.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1767
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Personal feed: the base class switches to action_load_personal_feed.
    _PERSONAL_FEED = True
1774
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Use a raw string for the pattern (was u'...'): '\.' is not a valid
    # string escape and only kept working by accident; every sibling
    # extractor declares its _VALID_URL as r'...'.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Personal feed: the base class switches to action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1781
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a regular playlist id; grab it and
        # hand extraction over to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1792
1793
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack a video id and abort with a
    helpful message instead of a cryptic failure."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Extraction can never succeed without a video id; point the user
        # at the most likely cause (an unquoted '&' eaten by the shell).
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)