_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 import collections
   4 import errno
   5 import io
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import struct
  11 import traceback
  12 import zlib
  13
  14 from .common import InfoExtractor, SearchInfoExtractor
  15 from .subtitles import SubtitlesInfoExtractor
  16 from ..jsinterp import JSInterpreter
  17 from ..utils import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24
  25     clean_html,
  26     get_cachedir,
  27     get_element_by_id,
  28     get_element_by_attribute,
  29     ExtractorError,
  30     int_or_none,
  31     PagedList,
  32     unescapeHTML,
  33     unified_strdate,
  34     orderedSet,
  35     write_json_file,
  36     uppercase_escape,
  37 )
  38
  39 class YoutubeBaseInfoExtractor(InfoExtractor):
  40     """Provide base functions for Youtube extractors"""
  41     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  42     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  43     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  44     _NETRC_MACHINE = 'youtube'
  45     # If True it will raise an error if no login info is provided
  46     _LOGIN_REQUIRED = False
  47
  48     def _set_language(self):
  49         return bool(self._download_webpage(
  50             self._LANG_URL, None,
  51             note=u'Setting language', errnote='unable to set language',
  52             fatal=False))
  53
  54     def _login(self):
  55         (username, password) = self._get_login_info()
  56         # No authentication to be performed
  57         if username is None:
  58             if self._LOGIN_REQUIRED:
  59                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  60             return False
  61
  62         login_page = self._download_webpage(
  63             self._LOGIN_URL, None,
  64             note=u'Downloading login page',
  65             errnote=u'unable to fetch login page', fatal=False)
  66         if login_page is False:
  67             return
  68
  69         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  70                                   login_page, u'Login GALX parameter')
  71
  72         # Log in
  73         login_form_strs = {
  74                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  75                 u'Email': username,
  76                 u'GALX': galx,
  77                 u'Passwd': password,
  78                 u'PersistentCookie': u'yes',
  79                 u'_utf8': u'霱',
  80                 u'bgresponse': u'js_disabled',
  81                 u'checkConnection': u'',
  82                 u'checkedDomains': u'youtube',
  83                 u'dnConn': u'',
  84                 u'pstMsg': u'0',
  85                 u'rmShown': u'1',
  86                 u'secTok': u'',
  87                 u'signIn': u'Sign in',
  88                 u'timeStmp': u'',
  89                 u'service': u'youtube',
  90                 u'uilel': u'3',
  91                 u'hl': u'en_US',
  92         }
  93         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
  94         # chokes on unicode
  95         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
  96         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
  97
  98         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
  99         login_results = self._download_webpage(
 100             req, None,
 101             note=u'Logging in', errnote=u'unable to log in', fatal=False)
 102         if login_results is False:
 103             return False
 104         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 105             self._downloader.report_warning(u'unable to log in: bad username or password')
 106             return False
 107         return True
 108
 109     def _confirm_age(self):
 110         age_form = {
 111             'next_url': '/',
 112             'action_confirm': 'Confirm',
 113         }
 114         req = compat_urllib_request.Request(self._AGE_URL,
 115             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 116
 117         self._download_webpage(
 118             req, None,
 119             note=u'Confirming age', errnote=u'Unable to confirm age')
 120         return True
 121
 122     def _real_initialize(self):
 123         if self._downloader is None:
 124             return
 125         if not self._set_language():
 126             return
 127         if not self._login():
 128             return
 129         self._confirm_age()
 130
 131
 132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 133     IE_DESC = u'YouTube.com'
 134     _VALID_URL = r"""(?x)^
 135                      (
 136                          (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
 137                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 138                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 139                             (?:www\.)?pwnyoutube\.com/|
 140                             (?:www\.)?yourepeat\.com/|
 141                             tube\.majestyc\.net/|
 142                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 143                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 144                          (?:                                                  # the various things that can precede the ID:
 145                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 146                              |(?:                                             # or the v= param in all its forms
 147                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 148                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 149                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 150                                  v=
 151                              )
 152                          ))
 153                          |youtu\.be/                                          # just youtu.be/xxxx
 154                          |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 155                          )
 156                      )?                                                       # all until now is optional -> you can pass the naked ID
 157                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 158                      (?(1).+)?                                                # if we found the ID, everything can follow
 159                      $"""
 160     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 161     _formats = {
 162         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 163         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 164         '13': {'ext': '3gp'},
 165         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 166         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 167         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 168         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 169         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 170         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 171         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 172         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 173         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 174         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 175         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 176         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 177
 178
 179         # 3d videos
 180         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 181         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 182         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 183         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 184         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 185         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 186         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 187
 188         # Apple HTTP Live Streaming
 189         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 190         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 191         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 192         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 193         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 194         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 195         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 196
 197         # DASH mp4 video
 198         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 199         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 200         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 201         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 202         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 203         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 204         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 205         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 206
 207         # Dash mp4 audio
 208         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 209         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 210         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 211
 212         # Dash webm
 213         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 214         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 215         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 216         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 217         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 218         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 219         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 220         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 221         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 222         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 223         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 224         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 225         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 226
 227         # Dash webm audio
 228         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
 229         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 230
 231         # RTMP (unnamed)
 232         '_rtmp': {'protocol': 'rtmp'},
 233     }
 234
 235     IE_NAME = u'youtube'
 236     _TESTS = [
 237         {
 238             u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
 239             u"file":  u"BaW_jenozKc.mp4",
 240             u"info_dict": {
 241                 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
 242                 u"uploader": u"Philipp Hagemeister",
 243                 u"uploader_id": u"phihag",
 244                 u"upload_date": u"20121002",
 245                 u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
 246                 u"categories": [u'Science & Technology'],
 247             }
 248         },
 249         {
 250             u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
 251             u"file":  u"UxxajLWwzqY.mp4",
 252             u"note": u"Test generic use_cipher_signature video (#897)",
 253             u"info_dict": {
 254                 u"upload_date": u"20120506",
 255                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
 256                 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
 257                 u"uploader": u"Icona Pop",
 258                 u"uploader_id": u"IconaPop"
 259             }
 260         },
 261         {
 262             u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
 263             u"file":  u"07FYdnEawAQ.mp4",
 264             u"note": u"Test VEVO video with age protection (#956)",
 265             u"info_dict": {
 266                 u"upload_date": u"20130703",
 267                 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
 268                 u"description": u"md5:64249768eec3bc4276236606ea996373",
 269                 u"uploader": u"justintimberlakeVEVO",
 270                 u"uploader_id": u"justintimberlakeVEVO"
 271             }
 272         },
 273         {
 274             u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
 275             u"file":  u"yZIXLfi8CZQ.mp4",
 276             u"note": u"Embed-only video (#1746)",
 277             u"info_dict": {
 278                 u"upload_date": u"20120608",
 279                 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
 280                 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
 281                 u"uploader": u"SET India",
 282                 u"uploader_id": u"setindia"
 283             }
 284         },
 285         {
 286             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 287             u"file": u"a9LDPn-MO4I.m4a",
 288             u"note": u"256k DASH audio (format 141) via DASH manifest",
 289             u"info_dict": {
 290                 u"upload_date": "20121002",
 291                 u"uploader_id": "8KVIDEO",
 292                 u"description": "No description available.",
 293                 u"uploader": "8KVIDEO",
 294                 u"title": "UHDTV TEST 8K VIDEO.mp4"
 295             },
 296             u"params": {
 297                 u"youtube_include_dash_manifest": True,
 298                 u"format": "141",
 299             },
 300         },
 301         # DASH manifest with encrypted signature
 302         {
 303             u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 304             u'info_dict': {
 305                 u'id': u'IB3lcPjvWLA',
 306                 u'ext': u'm4a',
 307                 u'title': u'Afrojack - The Spark ft. Spree Wilson',
 308                 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
 309                 u'uploader': u'AfrojackVEVO',
 310                 u'uploader_id': u'AfrojackVEVO',
 311                 u'upload_date': u'20131011',
 312             },
 313             u"params": {
 314                 u'youtube_include_dash_manifest': True,
 315                 u'format': '141',
 316             },
 317         },
 318     ]
 319
 320
 321     @classmethod
 322     def suitable(cls, url):
 323         """Receives a URL and returns True if suitable for this IE."""
 324         if YoutubePlaylistIE.suitable(url): return False
 325         return re.match(cls._VALID_URL, url) is not None
 326
 327     def __init__(self, *args, **kwargs):
 328         super(YoutubeIE, self).__init__(*args, **kwargs)
 329         self._player_cache = {}
 330
 331     def report_video_info_webpage_download(self, video_id):
 332         """Report attempt to download video info webpage."""
 333         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 334
 335     def report_information_extraction(self, video_id):
 336         """Report attempt to extract video information."""
 337         self.to_screen(u'%s: Extracting video information' % video_id)
 338
 339     def report_unavailable_format(self, video_id, format):
 340         """Report extracted video URL."""
 341         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 342
 343     def report_rtmp_download(self):
 344         """Indicate the download will use the RTMP protocol."""
 345         self.to_screen(u'RTMP download detected')
 346
 347     def _extract_signature_function(self, video_id, player_url, slen):
 348         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
 349                         player_url)
 350         player_type = id_m.group('ext')
 351         player_id = id_m.group('id')
 352
 353         # Read from filesystem cache
 354         func_id = '%s_%s_%d' % (player_type, player_id, slen)
 355         assert os.path.basename(func_id) == func_id
 356         cache_dir = get_cachedir(self._downloader.params)
 357
 358         cache_enabled = cache_dir is not None
 359         if cache_enabled:
 360             cache_fn = os.path.join(os.path.expanduser(cache_dir),
 361                                     u'youtube-sigfuncs',
 362                                     func_id + '.json')
 363             try:
 364                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
 365                     cache_spec = json.load(cachef)
 366                 return lambda s: u''.join(s[i] for i in cache_spec)
 367             except IOError:
 368                 pass  # No cache available
 369
 370         if player_type == 'js':
 371             code = self._download_webpage(
 372                 player_url, video_id,
 373                 note=u'Downloading %s player %s' % (player_type, player_id),
 374                 errnote=u'Download of %s failed' % player_url)
 375             res = self._parse_sig_js(code)
 376         elif player_type == 'swf':
 377             urlh = self._request_webpage(
 378                 player_url, video_id,
 379                 note=u'Downloading %s player %s' % (player_type, player_id),
 380                 errnote=u'Download of %s failed' % player_url)
 381             code = urlh.read()
 382             res = self._parse_sig_swf(code)
 383         else:
 384             assert False, 'Invalid player type %r' % player_type
 385
 386         if cache_enabled:
 387             try:
 388                 test_string = u''.join(map(compat_chr, range(slen)))
 389                 cache_res = res(test_string)
 390                 cache_spec = [ord(c) for c in cache_res]
 391                 try:
 392                     os.makedirs(os.path.dirname(cache_fn))
 393                 except OSError as ose:
 394                     if ose.errno != errno.EEXIST:
 395                         raise
 396                 write_json_file(cache_spec, cache_fn)
 397             except Exception:
 398                 tb = traceback.format_exc()
 399                 self._downloader.report_warning(
 400                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
 401
 402         return res
 403
    def _print_sig_code(self, func, slen):
        """Print Python code equivalent to func for signatures of length slen.

        func is applied to a probe string whose i-th character has code
        point i; the code points of the result therefore give, for each
        output position, the input index it was taken from.  That index list
        is rendered as a sum of s[...] index/slice expressions and written
        with to_screen.
        """
        def gen_sig_code(idxs):
            """Yield s[...] expressions covering idxs in order, collapsing
            runs that go up or down by exactly 1 into slices."""
            def _genslice(start, end, step):
                """Format the slice from start to end (inclusive) with the
                given step (+1 or -1)."""
                starts = u'' if start == 0 else str(start)
                # end + step is the exclusive bound; when it would be
                # negative (a descending run reaching index 0) the bound is
                # left out.
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            # step is None while no run is in progress, otherwise the
            # direction (+1 / -1) of the current run.
            step = None
            start = '(Never used)'  # Quiets pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk over consecutive pairs (prev, i) of indices.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    # The run ended at prev; i is picked up as prev on the
                    # next iteration, or by the code after the loop.
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new run begins at prev.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Emit whatever is pending for the last index.
            # NOTE(review): i is only bound if idxs has at least two
            # elements - confirm slen is always >= 2.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
 439
 440     def _parse_sig_js(self, jscode):
 441         funcname = self._search_regex(
 442             r'signature=([a-zA-Z]+)', jscode,
 443              u'Initial JS player signature function name')
 444
 445         jsi = JSInterpreter(jscode)
 446         initial_function = jsi.extract_function(funcname)
 447         return lambda s: initial_function([s])
 448
 449     def _parse_sig_swf(self, file_contents):
 450         if file_contents[1:3] != b'WS':
 451             raise ExtractorError(
 452                 u'Not an SWF file; header is %r' % file_contents[:3])
 453         if file_contents[:1] == b'C':
 454             content = zlib.decompress(file_contents[8:])
 455         else:
 456             raise NotImplementedError(u'Unsupported compression format %r' %
 457                                       file_contents[:1])
 458
 459         def extract_tags(content):
 460             pos = 0
 461             while pos < len(content):
 462                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
 463                 pos += 2
 464                 tag_code = header16 >> 6
 465                 tag_len = header16 & 0x3f
 466                 if tag_len == 0x3f:
 467                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
 468                     pos += 4
 469                 assert pos+tag_len <= len(content)
 470                 yield (tag_code, content[pos:pos+tag_len])
 471                 pos += tag_len
 472
 473         code_tag = next(tag
 474                         for tag_code, tag in extract_tags(content)
 475                         if tag_code == 82)
 476         p = code_tag.index(b'\0', 4) + 1
 477         code_reader = io.BytesIO(code_tag[p:])
 478
 479         # Parse ABC (AVM2 ByteCode)
 480         def read_int(reader=None):
 481             if reader is None:
 482                 reader = code_reader
 483             res = 0
 484             shift = 0
 485             for _ in range(5):
 486                 buf = reader.read(1)
 487                 assert len(buf) == 1
 488                 b = struct.unpack('<B', buf)[0]
 489                 res = res | ((b & 0x7f) << shift)
 490                 if b & 0x80 == 0:
 491                     break
 492                 shift += 7
 493             return res
 494
 495         def u30(reader=None):
 496             res = read_int(reader)
 497             assert res & 0xf0000000 == 0
 498             return res
 499         u32 = read_int
 500
 501         def s32(reader=None):
 502             v = read_int(reader)
 503             if v & 0x80000000 != 0:
 504                 v = - ((v ^ 0xffffffff) + 1)
 505             return v
 506
 507         def read_string(reader=None):
 508             if reader is None:
 509                 reader = code_reader
 510             slen = u30(reader)
 511             resb = reader.read(slen)
 512             assert len(resb) == slen
 513             return resb.decode('utf-8')
 514
 515         def read_bytes(count, reader=None):
 516             if reader is None:
 517                 reader = code_reader
 518             resb = reader.read(count)
 519             assert len(resb) == count
 520             return resb
 521
 522         def read_byte(reader=None):
 523             resb = read_bytes(1, reader=reader)
 524             res = struct.unpack('<B', resb)[0]
 525             return res
 526
 527         # minor_version + major_version
 528         read_bytes(2 + 2)
 529
 530         # Constant pool
 531         int_count = u30()
 532         for _c in range(1, int_count):
 533             s32()
 534         uint_count = u30()
 535         for _c in range(1, uint_count):
 536             u32()
 537         double_count = u30()
 538         read_bytes((double_count-1) * 8)
 539         string_count = u30()
 540         constant_strings = [u'']
 541         for _c in range(1, string_count):
 542             s = read_string()
 543             constant_strings.append(s)
 544         namespace_count = u30()
 545         for _c in range(1, namespace_count):
 546             read_bytes(1)  # kind
 547             u30()  # name
 548         ns_set_count = u30()
 549         for _c in range(1, ns_set_count):
 550             count = u30()
 551             for _c2 in range(count):
 552                 u30()
 553         multiname_count = u30()
 554         MULTINAME_SIZES = {
 555             0x07: 2,  # QName
 556             0x0d: 2,  # QNameA
 557             0x0f: 1,  # RTQName
 558             0x10: 1,  # RTQNameA
 559             0x11: 0,  # RTQNameL
 560             0x12: 0,  # RTQNameLA
 561             0x09: 2,  # Multiname
 562             0x0e: 2,  # MultinameA
 563             0x1b: 1,  # MultinameL
 564             0x1c: 1,  # MultinameLA
 565         }
 566         multinames = [u'']
 567         for _c in range(1, multiname_count):
 568             kind = u30()
 569             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
 570             if kind == 0x07:
 571                 u30()  # namespace_idx
 572                 name_idx = u30()
 573                 multinames.append(constant_strings[name_idx])
 574             else:
 575                 multinames.append('[MULTINAME kind: %d]' % kind)
 576                 for _c2 in range(MULTINAME_SIZES[kind]):
 577                     u30()
 578
 579         # Methods
 580         method_count = u30()
 581         MethodInfo = collections.namedtuple(
 582             'MethodInfo',
 583             ['NEED_ARGUMENTS', 'NEED_REST'])
 584         method_infos = []
 585         for method_id in range(method_count):
 586             param_count = u30()
 587             u30()  # return type
 588             for _ in range(param_count):
 589                 u30()  # param type
 590             u30()  # name index (always 0 for youtube)
 591             flags = read_byte()
 592             if flags & 0x08 != 0:
 593                 # Options present
 594                 option_count = u30()
 595                 for c in range(option_count):
 596                     u30()  # val
 597                     read_bytes(1)  # kind
 598             if flags & 0x80 != 0:
 599                 # Param names present
 600                 for _ in range(param_count):
 601                     u30()  # param name
 602             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
 603             method_infos.append(mi)
 604
 605         # Metadata
 606         metadata_count = u30()
 607         for _c in range(metadata_count):
 608             u30()  # name
 609             item_count = u30()
 610             for _c2 in range(item_count):
 611                 u30()  # key
 612                 u30()  # value
 613
 614         def parse_traits_info():
 615             trait_name_idx = u30()
 616             kind_full = read_byte()
 617             kind = kind_full & 0x0f
 618             attrs = kind_full >> 4
 619             methods = {}
 620             if kind in [0x00, 0x06]:  # Slot or Const
 621                 u30()  # Slot id
 622                 u30()  # type_name_idx
 623                 vindex = u30()
 624                 if vindex != 0:
 625                     read_byte()  # vkind
 626             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
 627                 u30()  # disp_id
 628                 method_idx = u30()
 629                 methods[multinames[trait_name_idx]] = method_idx
 630             elif kind == 0x04:  # Class
 631                 u30()  # slot_id
 632                 u30()  # classi
 633             elif kind == 0x05:  # Function
 634                 u30()  # slot_id
 635                 function_idx = u30()
 636                 methods[function_idx] = multinames[trait_name_idx]
 637             else:
 638                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
 639
 640             if attrs & 0x4 != 0:  # Metadata present
 641                 metadata_count = u30()
 642                 for _c3 in range(metadata_count):
 643                     u30()  # metadata index
 644
 645             return methods
 646
 647         # Classes
 648         TARGET_CLASSNAME = u'SignatureDecipher'
 649         searched_idx = multinames.index(TARGET_CLASSNAME)
 650         searched_class_id = None
 651         class_count = u30()
 652         for class_id in range(class_count):
 653             name_idx = u30()
 654             if name_idx == searched_idx:
 655                 # We found the class we're looking for!
 656                 searched_class_id = class_id
 657             u30()  # super_name idx
 658             flags = read_byte()
 659             if flags & 0x08 != 0:  # Protected namespace is present
 660                 u30()  # protected_ns_idx
 661             intrf_count = u30()
 662             for _c2 in range(intrf_count):
 663                 u30()
 664             u30()  # iinit
 665             trait_count = u30()
 666             for _c2 in range(trait_count):
 667                 parse_traits_info()
 668
 669         if searched_class_id is None:
 670             raise ExtractorError(u'Target class %r not found' %
 671                                  TARGET_CLASSNAME)
 672
 673         method_names = {}
 674         method_idxs = {}
 675         for class_id in range(class_count):
 676             u30()  # cinit
 677             trait_count = u30()
 678             for _c2 in range(trait_count):
 679                 trait_methods = parse_traits_info()
 680                 if class_id == searched_class_id:
 681                     method_names.update(trait_methods.items())
 682                     method_idxs.update(dict(
 683                         (idx, name)
 684                         for name, idx in trait_methods.items()))
 685
 686         # Scripts
 687         script_count = u30()
 688         for _c in range(script_count):
 689             u30()  # init
 690             trait_count = u30()
 691             for _c2 in range(trait_count):
 692                 parse_traits_info()
 693
 694         # Method bodies
 695         method_body_count = u30()
 696         Method = collections.namedtuple('Method', ['code', 'local_count'])
 697         methods = {}
 698         for _c in range(method_body_count):
 699             method_idx = u30()
 700             u30()  # max_stack
 701             local_count = u30()
 702             u30()  # init_scope_depth
 703             u30()  # max_scope_depth
 704             code_length = u30()
 705             code = read_bytes(code_length)
 706             if method_idx in method_idxs:
 707                 m = Method(code, local_count)
 708                 methods[method_idxs[method_idx]] = m
 709             exception_count = u30()
 710             for _c2 in range(exception_count):
 711                 u30()  # from
 712                 u30()  # to
 713                 u30()  # target
 714                 u30()  # exc_type
 715                 u30()  # var_name
 716             trait_count = u30()
 717             for _c2 in range(trait_count):
 718                 parse_traits_info()
 719
 720         assert p + code_reader.tell() == len(code_tag)
 721         assert len(methods) == len(method_idxs)
 722
 723         method_pyfunctions = {}
 724
 725         def extract_function(func_name):
 726             if func_name in method_pyfunctions:
 727                 return method_pyfunctions[func_name]
 728             if func_name not in methods:
 729                 raise ExtractorError(u'Cannot find function %r' % func_name)
 730             m = methods[func_name]
 731
 732             def resfunc(args):
 733                 registers = ['(this)'] + list(args) + [None] * m.local_count
 734                 stack = []
 735                 coder = io.BytesIO(m.code)
 736                 while True:
 737                     opcode = struct.unpack('!B', coder.read(1))[0]
 738                     if opcode == 36:  # pushbyte
 739                         v = struct.unpack('!B', coder.read(1))[0]
 740                         stack.append(v)
 741                     elif opcode == 44:  # pushstring
 742                         idx = u30(coder)
 743                         stack.append(constant_strings[idx])
 744                     elif opcode == 48:  # pushscope
 745                         # We don't implement the scope register, so we'll just
 746                         # ignore the popped value
 747                         stack.pop()
 748                     elif opcode == 70:  # callproperty
 749                         index = u30(coder)
 750                         mname = multinames[index]
 751                         arg_count = u30(coder)
 752                         args = list(reversed(
 753                             [stack.pop() for _ in range(arg_count)]))
 754                         obj = stack.pop()
 755                         if mname == u'split':
 756                             assert len(args) == 1
 757                             assert isinstance(args[0], compat_str)
 758                             assert isinstance(obj, compat_str)
 759                             if args[0] == u'':
 760                                 res = list(obj)
 761                             else:
 762                                 res = obj.split(args[0])
 763                             stack.append(res)
 764                         elif mname == u'slice':
 765                             assert len(args) == 1
 766                             assert isinstance(args[0], int)
 767                             assert isinstance(obj, list)
 768                             res = obj[args[0]:]
 769                             stack.append(res)
 770                         elif mname == u'join':
 771                             assert len(args) == 1
 772                             assert isinstance(args[0], compat_str)
 773                             assert isinstance(obj, list)
 774                             res = args[0].join(obj)
 775                             stack.append(res)
 776                         elif mname in method_pyfunctions:
 777                             stack.append(method_pyfunctions[mname](args))
 778                         else:
 779                             raise NotImplementedError(
 780                                 u'Unsupported property %r on %r'
 781                                 % (mname, obj))
 782                     elif opcode == 72:  # returnvalue
 783                         res = stack.pop()
 784                         return res
 785                     elif opcode == 79:  # callpropvoid
 786                         index = u30(coder)
 787                         mname = multinames[index]
 788                         arg_count = u30(coder)
 789                         args = list(reversed(
 790                             [stack.pop() for _ in range(arg_count)]))
 791                         obj = stack.pop()
 792                         if mname == u'reverse':
 793                             assert isinstance(obj, list)
 794                             obj.reverse()
 795                         else:
 796                             raise NotImplementedError(
 797                                 u'Unsupported (void) property %r on %r'
 798                                 % (mname, obj))
 799                     elif opcode == 93:  # findpropstrict
 800                         index = u30(coder)
 801                         mname = multinames[index]
 802                         res = extract_function(mname)
 803                         stack.append(res)
 804                     elif opcode == 97:  # setproperty
 805                         index = u30(coder)
 806                         value = stack.pop()
 807                         idx = stack.pop()
 808                         obj = stack.pop()
 809                         assert isinstance(obj, list)
 810                         assert isinstance(idx, int)
 811                         obj[idx] = value
 812                     elif opcode == 98:  # getlocal
 813                         index = u30(coder)
 814                         stack.append(registers[index])
 815                     elif opcode == 99:  # setlocal
 816                         index = u30(coder)
 817                         value = stack.pop()
 818                         registers[index] = value
 819                     elif opcode == 102:  # getproperty
 820                         index = u30(coder)
 821                         pname = multinames[index]
 822                         if pname == u'length':
 823                             obj = stack.pop()
 824                             assert isinstance(obj, list)
 825                             stack.append(len(obj))
 826                         else:  # Assume attribute access
 827                             idx = stack.pop()
 828                             assert isinstance(idx, int)
 829                             obj = stack.pop()
 830                             assert isinstance(obj, list)
 831                             stack.append(obj[idx])
 832                     elif opcode == 128:  # coerce
 833                         u30(coder)
 834                     elif opcode == 133:  # coerce_s
 835                         assert isinstance(stack[-1], (type(None), compat_str))
 836                     elif opcode == 164:  # modulo
 837                         value2 = stack.pop()
 838                         value1 = stack.pop()
 839                         res = value1 % value2
 840                         stack.append(res)
 841                     elif opcode == 208:  # getlocal_0
 842                         stack.append(registers[0])
 843                     elif opcode == 209:  # getlocal_1
 844                         stack.append(registers[1])
 845                     elif opcode == 210:  # getlocal_2
 846                         stack.append(registers[2])
 847                     elif opcode == 211:  # getlocal_3
 848                         stack.append(registers[3])
 849                     elif opcode == 214:  # setlocal_2
 850                         registers[2] = stack.pop()
 851                     elif opcode == 215:  # setlocal_3
 852                         registers[3] = stack.pop()
 853                     else:
 854                         raise NotImplementedError(
 855                             u'Unsupported opcode %d' % opcode)
 856
 857             method_pyfunctions[func_name] = resfunc
 858             return resfunc
 859
 860         initial_function = extract_function(u'decipher')
 861         return lambda s: initial_function([s])
 862
 863     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 864         """Turn the encrypted s field into a working signature"""
 865
 866         if player_url is not None:
 867             if player_url.startswith(u'//'):
 868                 player_url = u'https:' + player_url
 869             try:
 870                 player_id = (player_url, len(s))
 871                 if player_id not in self._player_cache:
 872                     func = self._extract_signature_function(
 873                         video_id, player_url, len(s)
 874                     )
 875                     self._player_cache[player_id] = func
 876                 func = self._player_cache[player_id]
 877                 if self._downloader.params.get('youtube_print_sig_code'):
 878                     self._print_sig_code(func, len(s))
 879                 return func(s)
 880             except Exception:
 881                 tb = traceback.format_exc()
 882                 self._downloader.report_warning(
 883                     u'Automatic signature extraction failed: ' + tb)
 884
 885             self._downloader.report_warning(
 886                 u'Warning: Falling back to static signature algorithm')
 887
 888         return self._static_decrypt_signature(
 889             s, video_id, player_url, age_gate)
 890
 891     def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
 892         if age_gate:
 893             # The videos with age protection use another player, so the
 894             # algorithms can be different.
 895             if len(s) == 86:
 896                 return s[2:63] + s[82] + s[64:82] + s[63]
 897
 898         if len(s) == 93:
 899             return s[86:29:-1] + s[88] + s[28:5:-1]
 900         elif len(s) == 92:
 901             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
 902         elif len(s) == 91:
 903             return s[84:27:-1] + s[86] + s[26:5:-1]
 904         elif len(s) == 90:
 905             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
 906         elif len(s) == 89:
 907             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
 908         elif len(s) == 88:
 909             return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
 910         elif len(s) == 87:
 911             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
 912         elif len(s) == 86:
 913             return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
 914         elif len(s) == 85:
 915             return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
 916         elif len(s) == 84:
 917             return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
 918         elif len(s) == 83:
 919             return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
 920         elif len(s) == 82:
 921             return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
 922         elif len(s) == 81:
 923             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
 924         elif len(s) == 80:
 925             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
 926         elif len(s) == 79:
 927             return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
 928
 929         else:
 930             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
 931
 932     def _get_available_subtitles(self, video_id, webpage):
 933         try:
 934             sub_list = self._download_webpage(
 935                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 936                 video_id, note=False)
 937         except ExtractorError as err:
 938             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 939             return {}
 940         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 941
 942         sub_lang_list = {}
 943         for l in lang_list:
 944             lang = l[1]
 945             params = compat_urllib_parse.urlencode({
 946                 'lang': lang,
 947                 'v': video_id,
 948                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 949                 'name': unescapeHTML(l[0]).encode('utf-8'),
 950             })
 951             url = u'https://www.youtube.com/api/timedtext?' + params
 952             sub_lang_list[lang] = url
 953         if not sub_lang_list:
 954             self._downloader.report_warning(u'video doesn\'t have subtitles')
 955             return {}
 956         return sub_lang_list
 957
 958     def _get_available_automatic_caption(self, video_id, webpage):
 959         """We need the webpage for getting the captions url, pass it as an
 960            argument to speed up the process."""
 961         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 962         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 963         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 964         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
 965         if mobj is None:
 966             self._downloader.report_warning(err_msg)
 967             return {}
 968         player_config = json.loads(mobj.group(1))
 969         try:
 970             args = player_config[u'args']
 971             caption_url = args[u'ttsurl']
 972             timestamp = args[u'timestamp']
 973             # We get the available subtitles
 974             list_params = compat_urllib_parse.urlencode({
 975                 'type': 'list',
 976                 'tlangs': 1,
 977                 'asrs': 1,
 978             })
 979             list_url = caption_url + '&' + list_params
 980             caption_list = self._download_xml(list_url, video_id)
 981             original_lang_node = caption_list.find('track')
 982             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 983                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
 984                 return {}
 985             original_lang = original_lang_node.attrib['lang_code']
 986
 987             sub_lang_list = {}
 988             for lang_node in caption_list.findall('target'):
 989                 sub_lang = lang_node.attrib['lang_code']
 990                 params = compat_urllib_parse.urlencode({
 991                     'lang': original_lang,
 992                     'tlang': sub_lang,
 993                     'fmt': sub_format,
 994                     'ts': timestamp,
 995                     'kind': 'asr',
 996                 })
 997                 sub_lang_list[sub_lang] = caption_url + '&' + params
 998             return sub_lang_list
 999         # An extractor error can be raise by the download process if there are
1000         # no automatic captions but there are subtitles
1001         except (KeyError, ExtractorError):
1002             self._downloader.report_warning(err_msg)
1003             return {}
1004
1005     @classmethod
1006     def extract_id(cls, url):
1007         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1008         if mobj is None:
1009             raise ExtractorError(u'Invalid URL: %s' % url)
1010         video_id = mobj.group(2)
1011         return video_id
1012
1013     def _extract_from_m3u8(self, manifest_url, video_id):
1014         url_map = {}
1015         def _get_urls(_manifest):
1016             lines = _manifest.split('\n')
1017             urls = filter(lambda l: l and not l.startswith('#'),
1018                             lines)
1019             return urls
1020         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1021         formats_urls = _get_urls(manifest)
1022         for format_url in formats_urls:
1023             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1024             url_map[itag] = format_url
1025         return url_map
1026
1027     def _extract_annotations(self, video_id):
1028         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1029         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1030
1031     def _real_extract(self, url):
1032         proto = (
1033             u'http' if self._downloader.params.get('prefer_insecure', False)
1034             else u'https')
1035
1036         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1037         mobj = re.search(self._NEXT_URL_RE, url)
1038         if mobj:
1039             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1040         video_id = self.extract_id(url)
1041
1042         # Get video webpage
1043         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1044         video_webpage = self._download_webpage(url, video_id)
1045
1046         # Attempt to extract SWF player URL
1047         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1048         if mobj is not None:
1049             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1050         else:
1051             player_url = None
1052
1053         # Get video info
1054         self.report_video_info_webpage_download(video_id)
1055         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1056             self.report_age_confirmation()
1057             age_gate = True
1058             # We simulate the access to the video from www.youtube.com/v/{video_id}
1059             # this can be viewed without login into Youtube
1060             data = compat_urllib_parse.urlencode({'video_id': video_id,
1061                                                   'el': 'player_embedded',
1062                                                   'gl': 'US',
1063                                                   'hl': 'en',
1064                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1065                                                   'asv': 3,
1066                                                   'sts':'1588',
1067                                                   })
1068             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1069             video_info_webpage = self._download_webpage(video_info_url, video_id,
1070                                     note=False,
1071                                     errnote='unable to download video info webpage')
1072             video_info = compat_parse_qs(video_info_webpage)
1073         else:
1074             age_gate = False
1075             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1076                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1077                         % (video_id, el_type))
1078                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1079                                         note=False,
1080                                         errnote='unable to download video info webpage')
1081                 video_info = compat_parse_qs(video_info_webpage)
1082                 if 'token' in video_info:
1083                     break
1084         if 'token' not in video_info:
1085             if 'reason' in video_info:
1086                 raise ExtractorError(
1087                     u'YouTube said: %s' % video_info['reason'][0],
1088                     expected=True, video_id=video_id)
1089             else:
1090                 raise ExtractorError(
1091                     u'"token" parameter not in video info for unknown reason',
1092                     video_id=video_id)
1093
1094         if 'view_count' in video_info:
1095             view_count = int(video_info['view_count'][0])
1096         else:
1097             view_count = None
1098
1099         # Check for "rental" videos
1100         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1101             raise ExtractorError(u'"rental" videos not supported')
1102
1103         # Start extracting information
1104         self.report_information_extraction(video_id)
1105
1106         # uploader
1107         if 'author' not in video_info:
1108             raise ExtractorError(u'Unable to extract uploader name')
1109         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1110
1111         # uploader_id
1112         video_uploader_id = None
1113         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1114         if mobj is not None:
1115             video_uploader_id = mobj.group(1)
1116         else:
1117             self._downloader.report_warning(u'unable to extract uploader nickname')
1118
1119         # title
1120         if 'title' in video_info:
1121             video_title = video_info['title'][0]
1122         else:
1123             self._downloader.report_warning(u'Unable to extract video title')
1124             video_title = u'_'
1125
1126         # thumbnail image
1127         # We try first to get a high quality image:
1128         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1129                             video_webpage, re.DOTALL)
1130         if m_thumb is not None:
1131             video_thumbnail = m_thumb.group(1)
1132         elif 'thumbnail_url' not in video_info:
1133             self._downloader.report_warning(u'unable to extract video thumbnail')
1134             video_thumbnail = None
1135         else:   # don't panic if we can't find it
1136             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1137
1138         # upload date
1139         upload_date = None
1140         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
1141         if mobj is None:
1142             mobj = re.search(
1143                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded) on (.*?)</strong>',
1144                 video_webpage)
1145         if mobj is not None:
1146             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1147             upload_date = unified_strdate(upload_date)
1148
1149         m_cat_container = get_element_by_id("eow-category", video_webpage)
1150         if m_cat_container:
1151             category = self._html_search_regex(
1152                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1153                 default=None)
1154             video_categories = None if category is None else [category]
1155         else:
1156             video_categories = None
1157
1158         # description
1159         video_description = get_element_by_id("eow-description", video_webpage)
1160         if video_description:
1161             video_description = re.sub(r'''(?x)
1162                 <a\s+
1163                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1164                     title="([^"]+)"\s+
1165                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1166                     class="yt-uix-redirect-link"\s*>
1167                 [^<]+
1168                 </a>
1169             ''', r'\1', video_description)
1170             video_description = clean_html(video_description)
1171         else:
1172             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1173             if fd_mobj:
1174                 video_description = unescapeHTML(fd_mobj.group(1))
1175             else:
1176                 video_description = u''
1177
1178         def _extract_count(klass):
1179             count = self._search_regex(
1180                 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1181                 video_webpage, klass, default=None)
1182             if count is not None:
1183                 return int(count.replace(',', ''))
1184             return None
1185         like_count = _extract_count(u'likes-count')
1186         dislike_count = _extract_count(u'dislikes-count')
1187
1188         # subtitles
1189         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1190
1191         if self._downloader.params.get('listsubtitles', False):
1192             self._list_available_subtitles(video_id, video_webpage)
1193             return
1194
1195         if 'length_seconds' not in video_info:
1196             self._downloader.report_warning(u'unable to extract video duration')
1197             video_duration = None
1198         else:
1199             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1200
1201         # annotations
1202         video_annotations = None
1203         if self._downloader.params.get('writeannotations', False):
1204                 video_annotations = self._extract_annotations(video_id)
1205
1206         # Decide which formats to download
1207         try:
1208             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1209             if not mobj:
1210                 raise ValueError('Could not find vevo ID')
1211             json_code = uppercase_escape(mobj.group(1))
1212             ytplayer_config = json.loads(json_code)
1213             args = ytplayer_config['args']
1214             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1215             # this signatures are encrypted
1216             if 'url_encoded_fmt_stream_map' not in args:
1217                 raise ValueError(u'No stream_map present')  # caught below
1218             re_signature = re.compile(r'[&,]s=')
1219             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1220             if m_s is not None:
1221                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1222                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1223             m_s = re_signature.search(args.get('adaptive_fmts', u''))
1224             if m_s is not None:
1225                 if 'adaptive_fmts' in video_info:
1226                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1227                 else:
1228                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1229         except ValueError:
1230             pass
1231
1232         def _map_to_format_list(urlmap):
1233             formats = []
1234             for itag, video_real_url in urlmap.items():
1235                 dct = {
1236                     'format_id': itag,
1237                     'url': video_real_url,
1238                     'player_url': player_url,
1239                 }
1240                 if itag in self._formats:
1241                     dct.update(self._formats[itag])
1242                 formats.append(dct)
1243             return formats
1244
1245         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1246             self.report_rtmp_download()
1247             formats = [{
1248                 'format_id': '_rtmp',
1249                 'protocol': 'rtmp',
1250                 'url': video_info['conn'][0],
1251                 'player_url': player_url,
1252             }]
1253         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1254             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1255             if 'rtmpe%3Dyes' in encoded_url_map:
1256                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1257             url_map = {}
1258             for url_data_str in encoded_url_map.split(','):
1259                 url_data = compat_parse_qs(url_data_str)
1260                 if 'itag' in url_data and 'url' in url_data:
1261                     url = url_data['url'][0]
1262                     if 'sig' in url_data:
1263                         url += '&signature=' + url_data['sig'][0]
1264                     elif 's' in url_data:
1265                         encrypted_sig = url_data['s'][0]
1266                         if self._downloader.params.get('verbose'):
1267                             if age_gate:
1268                                 if player_url is None:
1269                                     player_version = 'unknown'
1270                                 else:
1271                                     player_version = self._search_regex(
1272                                         r'-(.+)\.swf$', player_url,
1273                                         u'flash player', fatal=False)
1274                                 player_desc = 'flash player %s' % player_version
1275                             else:
1276                                 player_version = self._search_regex(
1277                                     r'html5player-(.+?)\.js', video_webpage,
1278                                     'html5 player', fatal=False)
1279                                 player_desc = u'html5 player %s' % player_version
1280
1281                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1282                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1283                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1284
1285                         if not age_gate:
1286                             jsplayer_url_json = self._search_regex(
1287                                 r'"assets":.+?"js":\s*("[^"]+")',
1288                                 video_webpage, u'JS player URL')
1289                             player_url = json.loads(jsplayer_url_json)
1290
1291                         signature = self._decrypt_signature(
1292                             encrypted_sig, video_id, player_url, age_gate)
1293                         url += '&signature=' + signature
1294                     if 'ratebypass' not in url:
1295                         url += '&ratebypass=yes'
1296                     url_map[url_data['itag'][0]] = url
1297             formats = _map_to_format_list(url_map)
1298         elif video_info.get('hlsvp'):
1299             manifest_url = video_info['hlsvp'][0]
1300             url_map = self._extract_from_m3u8(manifest_url, video_id)
1301             formats = _map_to_format_list(url_map)
1302         else:
1303             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1304
1305         # Look for the DASH manifest
1306         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
1307             try:
1308                 # The DASH manifest used needs to be the one from the original video_webpage.
1309                 # The one found in get_video_info seems to be using different signatures.
1310                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
1311                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
1312                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
1313                 if age_gate:
1314                     dash_manifest_url = video_info.get('dashmpd')[0]
1315                 else:
1316                     dash_manifest_url = ytplayer_config['args']['dashmpd']
1317                 def decrypt_sig(mobj):
1318                     s = mobj.group(1)
1319                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1320                     return '/signature/%s' % dec_s
1321                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
1322                 dash_doc = self._download_xml(
1323                     dash_manifest_url, video_id,
1324                     note=u'Downloading DASH manifest',
1325                     errnote=u'Could not download DASH manifest')
1326                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1327                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1328                     if url_el is None:
1329                         continue
1330                     format_id = r.attrib['id']
1331                     video_url = url_el.text
1332                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1333                     f = {
1334                         'format_id': format_id,
1335                         'url': video_url,
1336                         'width': int_or_none(r.attrib.get('width')),
1337                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1338                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1339                         'filesize': filesize,
1340                     }
1341                     try:
1342                         existing_format = next(
1343                             fo for fo in formats
1344                             if fo['format_id'] == format_id)
1345                     except StopIteration:
1346                         f.update(self._formats.get(format_id, {}))
1347                         formats.append(f)
1348                     else:
1349                         existing_format.update(f)
1350
1351             except (ExtractorError, KeyError) as e:
1352                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1353
1354         self._sort_formats(formats)
1355
1356         return {
1357             'id':           video_id,
1358             'uploader':     video_uploader,
1359             'uploader_id':  video_uploader_id,
1360             'upload_date':  upload_date,
1361             'title':        video_title,
1362             'thumbnail':    video_thumbnail,
1363             'description':  video_description,
1364             'categories':   video_categories,
1365             'subtitles':    video_subtitles,
1366             'duration':     video_duration,
1367             'age_limit':    18 if age_gate else 0,
1368             'annotations':  video_annotations,
1369             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1370             'view_count':   view_count,
1371             'like_count': like_count,
1372             'dislike_count': dislike_count,
1373             'formats':      formats,
1374         }
1375
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos referenced by a YouTube playlist.

    Regular playlists are read from the playlist page and its "load more"
    continuation pages.  Playlist ids starting with 'RD' (mixes) go through
    _extract_mix(), and ids starting with 'TL' are rejected with a hint to
    use the "yttoplist" keyword instead.
    """
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        """Run the login step inherited from the base extractor."""
        self._login()

    def _ids_to_results(self, ids):
        """Return one url result (handled by the 'Youtube' extractor) per video id."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix.

        A mix is generated from a single video: the playlist id is just
        'RD' followed by that video's id, so the last 11 characters of the
        id are used to build the watch URL that lists the mix.
        """
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')

        # Probe the candidate title containers in order and keep the first
        # one that yields something
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        pairs = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Deleted videos may still be listed; their username field is empty
        ids = [video_id for (username, video_id) in pairs if username]

        return self.playlist_result(self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for url.

        When the URL also names a video ('v' query parameter) and the
        'noplaylist' option is set, only that video is returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        # A watch URL carries a video id next to the playlist id
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query:
            video_id = query['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)
        content_html = page
        more_widget_html = page

        # Bail out early when the playlist does not exist or is private
        unavailable = re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page)
        if unavailable is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Collect the video ids page by page
        ids = []
        page_num = 0
        while True:
            page_num += 1
            # Duplicates are dropped, and so is the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1488
1489
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract the videos of a top list given as "yttoplist:{channel}:{list title}".

    The channel page is searched for a link whose text contains the
    url-encoded list title; the video ids are then read from the linked page.
    """
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Maximum number of times the list page is downloaded again when it comes
    # back without any video, so that the retry loop cannot run forever.
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Return a playlist result with the videos of the requested top list.

        Raises ExtractorError when the list page still contains no videos
        after _MAX_RETRIES retries.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos:
        # retry a bounded number of times until we get them
        for attempt in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Every attempt came back empty; give up instead of looping forever
            raise ExtractorError(
                u'Unable to find any videos in the top list "%s" after %d retries'
                % (title, self._MAX_RETRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1520
1521
class YoutubeChannelIE(InfoExtractor):
    """Extract all the videos listed on a YouTube channel.

    Auto-generated channels are read from the single /videos page; other
    channels are paged through the json-based c4_browse_ajax endpoint.
    """
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order and without duplicates."""
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in unique_ids:
                continue
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding every video found for the channel."""
        channel_match = re.match(self._VALID_URL, url)
        if channel_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = channel_match.group(1)

        # Fetch the channel's video listing page
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is None:
            # Regular channel: walk the json-based channel_ajax pages until
            # the "load more" widget disappears
            video_ids = []
            pagenum = 0
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
        else:
            # Auto-generated channel: all the videos sit in this single page,
            # the ajax pages can't be used because they are empty
            video_ids = self.extract_videos_from_page(channel_page)

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1576
1577
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user through the GData API.

    The uploads feed is queried page by page (_GDATA_PAGE_SIZE entries per
    request) and exposed lazily through a PagedList.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result, titled with the username, of the user's uploads."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video of the zero-based page pagenum."""
            # The API's start-index is one-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Index of the last entry that can be part of this page; the
            # reported range is inclusive on both ends
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing (more) to yield
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id field is a path-like value; its last component is the video id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1638
1639
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData videos feed ("ytsearch" keyword)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50

        collected_ids = []
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            # The API's start-index is one-based
            start_index = (PAGE_SIZE * pagenum) + 1
            encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            result_url = self._API_URL % (encoded_query, start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            page_ids = [video['id'] for video in api_response['items']]
            collected_ids.extend(page_ids)

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past the requested amount
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]

        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1681
1682
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, with the orderby=published parameter appended
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1688
1689
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results search URL into a playlist of its result links."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict, titled with the search query, of the results on the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Narrow the page down to the ordered list holding the results
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title lookup is non-fatal, the link lookup is not
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1724
1725
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into the playlists of its seasons."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url result per season link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show has its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        results = []
        for season_path in season_paths:
            results.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return results
1739
1740
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; still has a %s placeholder for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name, derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Run the login step inherited from the base extractor."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of every page of the feed."""
        entries = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            # The markup is read from 'feed_html', falling back to 'content_html'
            feed_html = info.get('feed_html') or info.get('content_html')

            video_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1785
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    # Matches the feed URL as well as the ":ytsubs" / ":ytsubscriptions" keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1791
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Matches the feed URL as well as the ":ytrec" / ":ytrecommended" keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1797
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'watch_later' personal feed."""
    _FEED_NAME = 'watch_later'
    # Requested through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Matches the feed URL as well as the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1804
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: in a non-raw literal '\.' is an
    # invalid string escape sequence (it only works because Python leaves
    # unknown escapes untouched, and newer versions warn about it).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Requested through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1811
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page into the playlist that backs it."""
    _LOGIN_REQUIRED = True
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the playlist id found on the page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first 'list=' parameter on the page is taken as the playlist id
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1822
1823
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their parameters and explain why.

    Extraction always fails with a hint about quoting the URL in the shell.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying the quoting hint."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)