Merge remote-tracking branch 'origin/master'
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18     compat_chr,
19     compat_parse_qs,
20     compat_urllib_parse,
21     compat_urllib_request,
22     compat_urlparse,
23     compat_str,
24
25     clean_html,
26     get_cachedir,
27     get_element_by_id,
28     get_element_by_attribute,
29     ExtractorError,
30     int_or_none,
31     PagedList,
32     unescapeHTML,
33     unified_strdate,
34     orderedSet,
35     write_json_file,
36     uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the site to English; return True if the request succeeded."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube with the configured credentials.

        Returns True on success and False on any failure (no credentials,
        download error, or rejected username/password).  Raises
        ExtractorError when _LOGIN_REQUIRED is set but no login info is
        available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False (not a bare None) so every failure path yields
            # the same boolean result for callers such as _real_initialize.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # The login form being present again means authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-confirmation form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Best effort: set language, log in, confirm age; stop early if a
        # preparatory step fails.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) pattern matching every supported single-video URL shape.
    # Group 1 captures the optional URL prefix; the following group captures
    # the 11-character video ID, and the conditional (?(1).+)? allows extra
    # URL content only when a prefix was matched.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the next_url query parameter (used by age-gate redirect URLs).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static format properties keyed by YouTube format id (presumably the
    # 'itag' URL parameter — confirm against the extraction code); values
    # are merged into the format dicts youtube-dl produces.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
236
    IE_NAME = u'youtube'
    # Download test definitions (URL plus expected file name and metadata);
    # presumably consumed by the project's test harness — verify against
    # the InfoExtractor test conventions.
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file":  u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
322
323     @classmethod
324     def suitable(cls, url):
325         """Receives a URL and returns True if suitable for this IE."""
326         if YoutubePlaylistIE.suitable(url): return False
327         return re.match(cls._VALID_URL, url) is not None
328
329     def __init__(self, *args, **kwargs):
330         super(YoutubeIE, self).__init__(*args, **kwargs)
331         self._player_cache = {}
332
333     def report_video_info_webpage_download(self, video_id):
334         """Report attempt to download video info webpage."""
335         self.to_screen(u'%s: Downloading video info webpage' % video_id)
336
337     def report_information_extraction(self, video_id):
338         """Report attempt to extract video information."""
339         self.to_screen(u'%s: Extracting video information' % video_id)
340
341     def report_unavailable_format(self, video_id, format):
342         """Report extracted video URL."""
343         self.to_screen(u'%s: Format %s not available' % (video_id, format))
344
345     def report_rtmp_download(self):
346         """Indicate the download will use the RTMP protocol."""
347         self.to_screen(u'RTMP download detected')
348
    def _extract_signature_function(self, video_id, player_url, slen):
        # Build (or load from the filesystem cache) a function that
        # deciphers an encrypted signature of length slen, using the js or
        # swf player referenced by player_url.
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$',
            player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path separators sneaking into the cache filename.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # The cached spec is a list of indices into the scrambled
                # signature; rebuilding the function is a simple reorder.
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Derive the index permutation by running the function over
                # a known test string, then persist it as JSON.
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort; never fail extraction over it.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
406
407     def _print_sig_code(self, func, slen):
408         def gen_sig_code(idxs):
409             def _genslice(start, end, step):
410                 starts = u'' if start == 0 else str(start)
411                 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
412                 steps = u'' if step == 1 else (u':%d' % step)
413                 return u's[%s%s%s]' % (starts, ends, steps)
414
415             step = None
416             start = '(Never used)'  # Quelch pyflakes warnings - start will be
417                                     # set as soon as step is set
418             for i, prev in zip(idxs[1:], idxs[:-1]):
419                 if step is not None:
420                     if i - prev == step:
421                         continue
422                     yield _genslice(start, prev, step)
423                     step = None
424                     continue
425                 if i - prev in [-1, 1]:
426                     step = i - prev
427                     start = prev
428                     continue
429                 else:
430                     yield u's[%d]' % prev
431             if step is None:
432                 yield u's[%d]' % i
433             else:
434                 yield _genslice(start, i, step)
435
436         test_string = u''.join(map(compat_chr, range(slen)))
437         cache_res = func(test_string)
438         cache_spec = [ord(c) for c in cache_res]
439         expr_code = u' + '.join(gen_sig_code(cache_spec))
440         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
441         self.to_screen(u'Extracted signature function:\n' + code)
442
443     def _parse_sig_js(self, jscode):
444         funcname = self._search_regex(
445             r'signature=([$a-zA-Z]+)', jscode,
446              u'Initial JS player signature function name')
447
448         jsi = JSInterpreter(jscode)
449         initial_function = jsi.extract_function(funcname)
450         return lambda s: initial_function([s])
451
452     def _parse_sig_swf(self, file_contents):
453         if file_contents[1:3] != b'WS':
454             raise ExtractorError(
455                 u'Not an SWF file; header is %r' % file_contents[:3])
456         if file_contents[:1] == b'C':
457             content = zlib.decompress(file_contents[8:])
458         else:
459             raise NotImplementedError(u'Unsupported compression format %r' %
460                                       file_contents[:1])
461
462         def extract_tags(content):
463             pos = 0
464             while pos < len(content):
465                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
466                 pos += 2
467                 tag_code = header16 >> 6
468                 tag_len = header16 & 0x3f
469                 if tag_len == 0x3f:
470                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
471                     pos += 4
472                 assert pos+tag_len <= len(content)
473                 yield (tag_code, content[pos:pos+tag_len])
474                 pos += tag_len
475
476         code_tag = next(tag
477                         for tag_code, tag in extract_tags(content)
478                         if tag_code == 82)
479         p = code_tag.index(b'\0', 4) + 1
480         code_reader = io.BytesIO(code_tag[p:])
481
482         # Parse ABC (AVM2 ByteCode)
483         def read_int(reader=None):
484             if reader is None:
485                 reader = code_reader
486             res = 0
487             shift = 0
488             for _ in range(5):
489                 buf = reader.read(1)
490                 assert len(buf) == 1
491                 b = struct.unpack('<B', buf)[0]
492                 res = res | ((b & 0x7f) << shift)
493                 if b & 0x80 == 0:
494                     break
495                 shift += 7
496             return res
497
498         def u30(reader=None):
499             res = read_int(reader)
500             assert res & 0xf0000000 == 0
501             return res
502         u32 = read_int
503
504         def s32(reader=None):
505             v = read_int(reader)
506             if v & 0x80000000 != 0:
507                 v = - ((v ^ 0xffffffff) + 1)
508             return v
509
510         def read_string(reader=None):
511             if reader is None:
512                 reader = code_reader
513             slen = u30(reader)
514             resb = reader.read(slen)
515             assert len(resb) == slen
516             return resb.decode('utf-8')
517
518         def read_bytes(count, reader=None):
519             if reader is None:
520                 reader = code_reader
521             resb = reader.read(count)
522             assert len(resb) == count
523             return resb
524
525         def read_byte(reader=None):
526             resb = read_bytes(1, reader=reader)
527             res = struct.unpack('<B', resb)[0]
528             return res
529
530         # minor_version + major_version
531         read_bytes(2 + 2)
532
533         # Constant pool
534         int_count = u30()
535         for _c in range(1, int_count):
536             s32()
537         uint_count = u30()
538         for _c in range(1, uint_count):
539             u32()
540         double_count = u30()
541         read_bytes((double_count-1) * 8)
542         string_count = u30()
543         constant_strings = [u'']
544         for _c in range(1, string_count):
545             s = read_string()
546             constant_strings.append(s)
547         namespace_count = u30()
548         for _c in range(1, namespace_count):
549             read_bytes(1)  # kind
550             u30()  # name
551         ns_set_count = u30()
552         for _c in range(1, ns_set_count):
553             count = u30()
554             for _c2 in range(count):
555                 u30()
556         multiname_count = u30()
557         MULTINAME_SIZES = {
558             0x07: 2,  # QName
559             0x0d: 2,  # QNameA
560             0x0f: 1,  # RTQName
561             0x10: 1,  # RTQNameA
562             0x11: 0,  # RTQNameL
563             0x12: 0,  # RTQNameLA
564             0x09: 2,  # Multiname
565             0x0e: 2,  # MultinameA
566             0x1b: 1,  # MultinameL
567             0x1c: 1,  # MultinameLA
568         }
569         multinames = [u'']
570         for _c in range(1, multiname_count):
571             kind = u30()
572             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
573             if kind == 0x07:
574                 u30()  # namespace_idx
575                 name_idx = u30()
576                 multinames.append(constant_strings[name_idx])
577             else:
578                 multinames.append('[MULTINAME kind: %d]' % kind)
579                 for _c2 in range(MULTINAME_SIZES[kind]):
580                     u30()
581
582         # Methods
583         method_count = u30()
584         MethodInfo = collections.namedtuple(
585             'MethodInfo',
586             ['NEED_ARGUMENTS', 'NEED_REST'])
587         method_infos = []
588         for method_id in range(method_count):
589             param_count = u30()
590             u30()  # return type
591             for _ in range(param_count):
592                 u30()  # param type
593             u30()  # name index (always 0 for youtube)
594             flags = read_byte()
595             if flags & 0x08 != 0:
596                 # Options present
597                 option_count = u30()
598                 for c in range(option_count):
599                     u30()  # val
600                     read_bytes(1)  # kind
601             if flags & 0x80 != 0:
602                 # Param names present
603                 for _ in range(param_count):
604                     u30()  # param name
605             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
606             method_infos.append(mi)
607
608         # Metadata
609         metadata_count = u30()
610         for _c in range(metadata_count):
611             u30()  # name
612             item_count = u30()
613             for _c2 in range(item_count):
614                 u30()  # key
615                 u30()  # value
616
617         def parse_traits_info():
618             trait_name_idx = u30()
619             kind_full = read_byte()
620             kind = kind_full & 0x0f
621             attrs = kind_full >> 4
622             methods = {}
623             if kind in [0x00, 0x06]:  # Slot or Const
624                 u30()  # Slot id
625                 u30()  # type_name_idx
626                 vindex = u30()
627                 if vindex != 0:
628                     read_byte()  # vkind
629             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
630                 u30()  # disp_id
631                 method_idx = u30()
632                 methods[multinames[trait_name_idx]] = method_idx
633             elif kind == 0x04:  # Class
634                 u30()  # slot_id
635                 u30()  # classi
636             elif kind == 0x05:  # Function
637                 u30()  # slot_id
638                 function_idx = u30()
639                 methods[function_idx] = multinames[trait_name_idx]
640             else:
641                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
642
643             if attrs & 0x4 != 0:  # Metadata present
644                 metadata_count = u30()
645                 for _c3 in range(metadata_count):
646                     u30()  # metadata index
647
648             return methods
649
650         # Classes
651         TARGET_CLASSNAME = u'SignatureDecipher'
652         searched_idx = multinames.index(TARGET_CLASSNAME)
653         searched_class_id = None
654         class_count = u30()
655         for class_id in range(class_count):
656             name_idx = u30()
657             if name_idx == searched_idx:
658                 # We found the class we're looking for!
659                 searched_class_id = class_id
660             u30()  # super_name idx
661             flags = read_byte()
662             if flags & 0x08 != 0:  # Protected namespace is present
663                 u30()  # protected_ns_idx
664             intrf_count = u30()
665             for _c2 in range(intrf_count):
666                 u30()
667             u30()  # iinit
668             trait_count = u30()
669             for _c2 in range(trait_count):
670                 parse_traits_info()
671
672         if searched_class_id is None:
673             raise ExtractorError(u'Target class %r not found' %
674                                  TARGET_CLASSNAME)
675
676         method_names = {}
677         method_idxs = {}
678         for class_id in range(class_count):
679             u30()  # cinit
680             trait_count = u30()
681             for _c2 in range(trait_count):
682                 trait_methods = parse_traits_info()
683                 if class_id == searched_class_id:
684                     method_names.update(trait_methods.items())
685                     method_idxs.update(dict(
686                         (idx, name)
687                         for name, idx in trait_methods.items()))
688
689         # Scripts
690         script_count = u30()
691         for _c in range(script_count):
692             u30()  # init
693             trait_count = u30()
694             for _c2 in range(trait_count):
695                 parse_traits_info()
696
697         # Method bodies
698         method_body_count = u30()
699         Method = collections.namedtuple('Method', ['code', 'local_count'])
700         methods = {}
701         for _c in range(method_body_count):
702             method_idx = u30()
703             u30()  # max_stack
704             local_count = u30()
705             u30()  # init_scope_depth
706             u30()  # max_scope_depth
707             code_length = u30()
708             code = read_bytes(code_length)
709             if method_idx in method_idxs:
710                 m = Method(code, local_count)
711                 methods[method_idxs[method_idx]] = m
712             exception_count = u30()
713             for _c2 in range(exception_count):
714                 u30()  # from
715                 u30()  # to
716                 u30()  # target
717                 u30()  # exc_type
718                 u30()  # var_name
719             trait_count = u30()
720             for _c2 in range(trait_count):
721                 parse_traits_info()
722
723         assert p + code_reader.tell() == len(code_tag)
724         assert len(methods) == len(method_idxs)
725
726         method_pyfunctions = {}
727
728         def extract_function(func_name):
729             if func_name in method_pyfunctions:
730                 return method_pyfunctions[func_name]
731             if func_name not in methods:
732                 raise ExtractorError(u'Cannot find function %r' % func_name)
733             m = methods[func_name]
734
735             def resfunc(args):
736                 registers = ['(this)'] + list(args) + [None] * m.local_count
737                 stack = []
738                 coder = io.BytesIO(m.code)
739                 while True:
740                     opcode = struct.unpack('!B', coder.read(1))[0]
741                     if opcode == 36:  # pushbyte
742                         v = struct.unpack('!B', coder.read(1))[0]
743                         stack.append(v)
744                     elif opcode == 44:  # pushstring
745                         idx = u30(coder)
746                         stack.append(constant_strings[idx])
747                     elif opcode == 48:  # pushscope
748                         # We don't implement the scope register, so we'll just
749                         # ignore the popped value
750                         stack.pop()
751                     elif opcode == 70:  # callproperty
752                         index = u30(coder)
753                         mname = multinames[index]
754                         arg_count = u30(coder)
755                         args = list(reversed(
756                             [stack.pop() for _ in range(arg_count)]))
757                         obj = stack.pop()
758                         if mname == u'split':
759                             assert len(args) == 1
760                             assert isinstance(args[0], compat_str)
761                             assert isinstance(obj, compat_str)
762                             if args[0] == u'':
763                                 res = list(obj)
764                             else:
765                                 res = obj.split(args[0])
766                             stack.append(res)
767                         elif mname == u'slice':
768                             assert len(args) == 1
769                             assert isinstance(args[0], int)
770                             assert isinstance(obj, list)
771                             res = obj[args[0]:]
772                             stack.append(res)
773                         elif mname == u'join':
774                             assert len(args) == 1
775                             assert isinstance(args[0], compat_str)
776                             assert isinstance(obj, list)
777                             res = args[0].join(obj)
778                             stack.append(res)
779                         elif mname in method_pyfunctions:
780                             stack.append(method_pyfunctions[mname](args))
781                         else:
782                             raise NotImplementedError(
783                                 u'Unsupported property %r on %r'
784                                 % (mname, obj))
785                     elif opcode == 72:  # returnvalue
786                         res = stack.pop()
787                         return res
788                     elif opcode == 79:  # callpropvoid
789                         index = u30(coder)
790                         mname = multinames[index]
791                         arg_count = u30(coder)
792                         args = list(reversed(
793                             [stack.pop() for _ in range(arg_count)]))
794                         obj = stack.pop()
795                         if mname == u'reverse':
796                             assert isinstance(obj, list)
797                             obj.reverse()
798                         else:
799                             raise NotImplementedError(
800                                 u'Unsupported (void) property %r on %r'
801                                 % (mname, obj))
802                     elif opcode == 93:  # findpropstrict
803                         index = u30(coder)
804                         mname = multinames[index]
805                         res = extract_function(mname)
806                         stack.append(res)
807                     elif opcode == 97:  # setproperty
808                         index = u30(coder)
809                         value = stack.pop()
810                         idx = stack.pop()
811                         obj = stack.pop()
812                         assert isinstance(obj, list)
813                         assert isinstance(idx, int)
814                         obj[idx] = value
815                     elif opcode == 98:  # getlocal
816                         index = u30(coder)
817                         stack.append(registers[index])
818                     elif opcode == 99:  # setlocal
819                         index = u30(coder)
820                         value = stack.pop()
821                         registers[index] = value
822                     elif opcode == 102:  # getproperty
823                         index = u30(coder)
824                         pname = multinames[index]
825                         if pname == u'length':
826                             obj = stack.pop()
827                             assert isinstance(obj, list)
828                             stack.append(len(obj))
829                         else:  # Assume attribute access
830                             idx = stack.pop()
831                             assert isinstance(idx, int)
832                             obj = stack.pop()
833                             assert isinstance(obj, list)
834                             stack.append(obj[idx])
835                     elif opcode == 128:  # coerce
836                         u30(coder)
837                     elif opcode == 133:  # coerce_s
838                         assert isinstance(stack[-1], (type(None), compat_str))
839                     elif opcode == 164:  # modulo
840                         value2 = stack.pop()
841                         value1 = stack.pop()
842                         res = value1 % value2
843                         stack.append(res)
844                     elif opcode == 208:  # getlocal_0
845                         stack.append(registers[0])
846                     elif opcode == 209:  # getlocal_1
847                         stack.append(registers[1])
848                     elif opcode == 210:  # getlocal_2
849                         stack.append(registers[2])
850                     elif opcode == 211:  # getlocal_3
851                         stack.append(registers[3])
852                     elif opcode == 214:  # setlocal_2
853                         registers[2] = stack.pop()
854                     elif opcode == 215:  # setlocal_3
855                         registers[3] = stack.pop()
856                     else:
857                         raise NotImplementedError(
858                             u'Unsupported opcode %d' % opcode)
859
860             method_pyfunctions[func_name] = resfunc
861             return resfunc
862
863         initial_function = extract_function(u'decipher')
864         return lambda s: initial_function([s])
865
866     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
867         """Turn the encrypted s field into a working signature"""
868
869         if player_url is None:
870             raise ExtractorError(u'Cannot decrypt signature without player_url')
871
872         if player_url.startswith(u'//'):
873             player_url = u'https:' + player_url
874         try:
875             player_id = (player_url, len(s))
876             if player_id not in self._player_cache:
877                 func = self._extract_signature_function(
878                     video_id, player_url, len(s)
879                 )
880                 self._player_cache[player_id] = func
881             func = self._player_cache[player_id]
882             if self._downloader.params.get('youtube_print_sig_code'):
883                 self._print_sig_code(func, len(s))
884             return func(s)
885         except Exception as e:
886             tb = traceback.format_exc()
887             raise ExtractorError(
888                 u'Automatic signature extraction failed: ' + tb, cause=e)
889
890     def _get_available_subtitles(self, video_id, webpage):
891         try:
892             sub_list = self._download_webpage(
893                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
894                 video_id, note=False)
895         except ExtractorError as err:
896             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
897             return {}
898         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
899
900         sub_lang_list = {}
901         for l in lang_list:
902             lang = l[1]
903             params = compat_urllib_parse.urlencode({
904                 'lang': lang,
905                 'v': video_id,
906                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
907                 'name': unescapeHTML(l[0]).encode('utf-8'),
908             })
909             url = u'https://www.youtube.com/api/timedtext?' + params
910             sub_lang_list[lang] = url
911         if not sub_lang_list:
912             self._downloader.report_warning(u'video doesn\'t have subtitles')
913             return {}
914         return sub_lang_list
915
916     def _get_available_automatic_caption(self, video_id, webpage):
917         """We need the webpage for getting the captions url, pass it as an
918            argument to speed up the process."""
919         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
920         self.to_screen(u'%s: Looking for automatic captions' % video_id)
921         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
922         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
923         if mobj is None:
924             self._downloader.report_warning(err_msg)
925             return {}
926         player_config = json.loads(mobj.group(1))
927         try:
928             args = player_config[u'args']
929             caption_url = args[u'ttsurl']
930             timestamp = args[u'timestamp']
931             # We get the available subtitles
932             list_params = compat_urllib_parse.urlencode({
933                 'type': 'list',
934                 'tlangs': 1,
935                 'asrs': 1,
936             })
937             list_url = caption_url + '&' + list_params
938             caption_list = self._download_xml(list_url, video_id)
939             original_lang_node = caption_list.find('track')
940             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
941                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
942                 return {}
943             original_lang = original_lang_node.attrib['lang_code']
944
945             sub_lang_list = {}
946             for lang_node in caption_list.findall('target'):
947                 sub_lang = lang_node.attrib['lang_code']
948                 params = compat_urllib_parse.urlencode({
949                     'lang': original_lang,
950                     'tlang': sub_lang,
951                     'fmt': sub_format,
952                     'ts': timestamp,
953                     'kind': 'asr',
954                 })
955                 sub_lang_list[sub_lang] = caption_url + '&' + params
956             return sub_lang_list
957         # An extractor error can be raise by the download process if there are
958         # no automatic captions but there are subtitles
959         except (KeyError, ExtractorError):
960             self._downloader.report_warning(err_msg)
961             return {}
962
963     @classmethod
964     def extract_id(cls, url):
965         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
966         if mobj is None:
967             raise ExtractorError(u'Invalid URL: %s' % url)
968         video_id = mobj.group(2)
969         return video_id
970
971     def _extract_from_m3u8(self, manifest_url, video_id):
972         url_map = {}
973         def _get_urls(_manifest):
974             lines = _manifest.split('\n')
975             urls = filter(lambda l: l and not l.startswith('#'),
976                             lines)
977             return urls
978         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
979         formats_urls = _get_urls(manifest)
980         for format_url in formats_urls:
981             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
982             url_map[itag] = format_url
983         return url_map
984
985     def _extract_annotations(self, video_id):
986         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
987         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
988
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single YouTube video."""
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the backslash-escaped URL captured from the page JS
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' appears to be a fixed signature timestamp sent
            # by the embedded player -- confirm against current player behavior
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values; stop at the first response containing a 'token'
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                # YouTube supplied an explicit (user-presentable) failure reason
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            # Missing title is non-fatal; use a placeholder
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            # Fall back to the uploader-info block on newer page layouts
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Normalize separators/whitespace before parsing the date string
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = get_element_by_id("eow-category", video_webpage)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace YouTube redirect links with their visible title text
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Parse an integer counter (e.g. "1,234") from a span with the given class
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
                video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                # Prefer the webpage's stream map: it carries the encrypted sigs
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            # Any of the sentinel ValueErrors above: proceed with video_info as-is
            pass

        def _map_to_format_list(urlmap):
            # Build format dicts from an {itag: url} map, merging known itag metadata
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already decrypted) signature
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: needs the player to decipher it
                        encrypted_sig = url_data['s'][0]

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)
                        if player_url is None:
                            # Age-gated page without a SWF player: fall back to config URL
                            player_url_json = self._search_regex(
                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                                video_webpage, u'age gate player URL')
                            player_url = json.loads(player_url_json)

                        if self._downloader.params.get('verbose'):
                            # Report which player (flash/html5) will decrypt the signature
                            if player_url is None:
                                player_version = 'unknown'
                                player_desc = 'unknown'
                            else:
                                if player_url.endswith('swf'):
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                    player_desc = 'flash player %s' % player_version
                                else:
                                    player_version = self._search_regex(
                                        r'html5player-(.+?)\.js', video_webpage,
                                        'html5 player', fatal=False)
                                    player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            # HLS-only video (e.g. live streams): formats come from the m3u8 manifest
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    # re.sub callback: replace an encrypted /s/<sig> path segment
                    # with its decrypted /signature/<sig> equivalent
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Merge into an already-extracted format with the same id, if any
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                # The DASH manifest is optional; failure here is never fatal
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id':           video_id,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  upload_date,
            'title':        video_title,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'categories':   video_categories,
            'subtitles':    video_subtitles,
            'duration':     video_duration,
            'age_limit':    18 if age_gate else 0,
            'annotations':  video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count':   view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats':      formats,
        }
1340
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist (including mixes) as a playlist result."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Marker present in the "load more" widget while further pages exist
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Captures both the video id and its 1-based position in the playlist
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Playlists may be private; authenticate if credentials were supplied
        self._login()

    def _ids_to_results(self, ids):
        """Wrap a list of video ids into url_result dicts handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        # The title markup varies; try the known class names from most to least specific
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        # Only links that carry this mix's list id belong to the mix
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The id is captured by one of the two alternatives in _VALID_URL
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            # Top lists are handled by YoutubeTopListIE via the yttoplist keyword
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the URL of the next ajax page
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1451
1452
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a 'yttoplist:<channel>:<list title>' pseudo-URL to the matching playlist."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # The playlist page sometimes comes back without the video list; retry a
    # bounded number of times instead of looping forever.
    _MAX_RETRIES = 5

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Find the playlist link on the channel page by its urlencoded title
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up eventually
        for i in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Exhausted the retries without ever seeing a video list
            raise ExtractorError(u'Could not find the list of videos')
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1483
1484
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel as a playlist result."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker present in the "load more" widget while further pages exist
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in page order."""
        ids_in_page = []
        seen = set()  # set membership keeps the dedup scan linear
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels are marked as such in their page markup
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1539
1540
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploads (via the GData API) as a lazily-paged playlist."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match.
        other_ies = (klass for (name, klass) in globals().items()
                     if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url-result dicts for one GData API page (empty page ends the feed)."""
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries left: the generator yields nothing and PagedList stops
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1601
1602
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube via the GData API ('ytsearchN:query')."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never request beyond it
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1644
1645
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same as the plain search, but the API query orders results by upload date
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1651
1652
class YoutubeSearchURLIE(InfoExtractor):
    """Extract a YouTube results-page URL as a playlist of the listed videos."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # All results live inside a single <ol class="item-section">
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # Title may appear as an attribute or as the link text
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1687
1688
class YoutubeShowIE(InfoExtractor):
    """Extract a show page as one playlist url_result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed as its own playlist link
        season_paths = [mobj.group(1) for mobj in
                        re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
            for path in season_paths]
1702
1703
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (watch later, history) use a different ajax action
        action = ('action_load_personal_feed' if self._PERSONAL_FEED
                  else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # These feeds are per-account and always require authentication
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for page_num in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_num)
            # Older responses use 'feed_html', newer ones 'content_html'
            feed_html = info.get('feed_html') or info.get('content_html')
            ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # The "load more" widget carries the paging token of the next page
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1748
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed: new uploads from the account's subscribed channels
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1754
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed: videos YouTube recommends for the logged-in account
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1760
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Personal feed (_PERSONAL_FEED switches the ajax action in the base class)
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1767
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Personal feed (_PERSONAL_FEED switches the ajax action in the base class)
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, matching the sibling classes: the previous u'' literal relied
    # on '\.' being an unrecognized (invalid) escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1774
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the account's favourites to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # Favourites are stored as a regular playlist; find its id and delegate
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist_id = self._search_regex(
            r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')
1785
1786
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that were truncated at an unquoted '&'
    and tell the user to quote the URL instead of silently failing."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Such URLs are never extractable; always report the shell-quoting hint
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)