_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import time
  11 import traceback
  12
  13 from .common import InfoExtractor, SearchInfoExtractor
  14 from .subtitles import SubtitlesInfoExtractor
  15 from ..jsinterp import JSInterpreter
  16 from ..swfinterp import SWFInterpreter
  17 from ..compat import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24 )
  25 from ..utils import (
  26     clean_html,
  27     ExtractorError,
  28     get_element_by_attribute,
  29     get_element_by_id,
  30     int_or_none,
  31     OnDemandPagedList,
  32     orderedSet,
  33     unescapeHTML,
  34     unified_strdate,
  35     uppercase_escape,
  36 )
  37
  38
  39 class YoutubeBaseInfoExtractor(InfoExtractor):
  40     """Provide base functions for Youtube extractors"""
  41     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  42     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  43     _NETRC_MACHINE = 'youtube'
  44     # If True it will raise an error if no login info is provided
  45     _LOGIN_REQUIRED = False
  46
  47     def _set_language(self):
  48         self._set_cookie(
  49             '.youtube.com', 'PREF', 'f1=50000000&hl=en',
  50             # YouTube sets the expire time to about two months
  51             expire_time=time.time() + 2 * 30 * 24 * 3600)
  52
  53     def _login(self):
  54         """
  55         Attempt to log in to YouTube.
  56         True is returned if successful or skipped.
  57         False is returned if login failed.
  58
  59         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  60         """
  61         (username, password) = self._get_login_info()
  62         # No authentication to be performed
  63         if username is None:
  64             if self._LOGIN_REQUIRED:
  65                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  66             return True
  67
  68         login_page = self._download_webpage(
  69             self._LOGIN_URL, None,
  70             note='Downloading login page',
  71             errnote='unable to fetch login page', fatal=False)
  72         if login_page is False:
  73             return
  74
  75         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  76                                   login_page, 'Login GALX parameter')
  77
  78         # Log in
  79         login_form_strs = {
  80             'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  81             'Email': username,
  82             'GALX': galx,
  83             'Passwd': password,
  84
  85             'PersistentCookie': 'yes',
  86             '_utf8': '霱',
  87             'bgresponse': 'js_disabled',
  88             'checkConnection': '',
  89             'checkedDomains': 'youtube',
  90             'dnConn': '',
  91             'pstMsg': '0',
  92             'rmShown': '1',
  93             'secTok': '',
  94             'signIn': 'Sign in',
  95             'timeStmp': '',
  96             'service': 'youtube',
  97             'uilel': '3',
  98             'hl': 'en_US',
  99         }
 100
 101         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 102         # chokes on unicode
 103         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
 104         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 105
 106         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 107         login_results = self._download_webpage(
 108             req, None,
 109             note='Logging in', errnote='unable to log in', fatal=False)
 110         if login_results is False:
 111             return False
 112
 113         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 114             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 115
 116         # Two-Factor
 117         # TODO add SMS and phone call support - these require making a request and then prompting the user
 118
 119         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 120             tfa_code = self._get_tfa_info()
 121
 122             if tfa_code is None:
 123                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 124                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 125                 return False
 126
 127             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 128
 129             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 130             if match is None:
 131                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 132             secTok = match.group(1)
 133             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 134             if match is None:
 135                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 136             timeStmp = match.group(1)
 137
 138             tfa_form_strs = {
 139                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 140                 'smsToken': '',
 141                 'smsUserPin': tfa_code,
 142                 'smsVerifyPin': 'Verify',
 143
 144                 'PersistentCookie': 'yes',
 145                 'checkConnection': '',
 146                 'checkedDomains': 'youtube',
 147                 'pstMsg': '1',
 148                 'secTok': secTok,
 149                 'timeStmp': timeStmp,
 150                 'service': 'youtube',
 151                 'hl': 'en_US',
 152             }
 153             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 154             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 155
 156             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 157             tfa_results = self._download_webpage(
 158                 tfa_req, None,
 159                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 160
 161             if tfa_results is False:
 162                 return False
 163
 164             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 165                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 166                 return False
 167             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 168                 self._downloader.report_warning('unable to log in - did the page structure change?')
 169                 return False
 170             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 171                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 172                 return False
 173
 174         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 175             self._downloader.report_warning('unable to log in: bad username or password')
 176             return False
 177         return True
 178
 179     def _real_initialize(self):
 180         if self._downloader is None:
 181             return
 182         self._set_language()
 183         if not self._login():
 184             return
 185
 186
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Verbose-mode regex for the URLs handled by this extractor.  Group 1 is the
    # optional URL prefix (host plus whatever precedes the ID) and group 2 is
    # the 11-character video ID, so a bare ID is accepted as well.  The
    # negative lookahead rejects anything followed by "&list=".
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the value of a "next_url" query parameter.
    # NOTE(review): its use is not visible in this part of the file.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static per-format metadata, keyed by the format code as a string.
    # Each value is a partial format description (extension, dimensions,
    # codecs, bitrate, note, preference); not every key is present for every
    # format.  More negative 'preference' values are given to the 3D, HLS and
    # DASH entries than to the plain ones, which carry no preference at all.
    # NOTE(review): how these entries are merged into extracted formats is not
    # visible in this part of the file.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
 306
    IE_NAME = 'youtube'
    # Test cases for this extractor.  Each entry gives the input 'url' (a full
    # URL or a bare video ID), the expected 'info_dict' fields, and optionally
    # a 'note', download 'params' and 'expected_warnings'.
    # NOTE(review): the meaning of 'md5:'/'re:' prefixed values and of type
    # objects such as int is defined by the test harness, which is not visible
    # here.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # Age-gate video with encrypted signature
        {
            'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
            'info_dict': {
                'id': '6kLq3WMV1nU',
                'ext': 'mp4',
                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                'uploader': 'LloydVEVO',
                'uploader_id': 'LloydVEVO',
                'upload_date': '20110629',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',
                'upload_date': '20120731',
                'uploader_id': 'olympic',
                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                'uploader': 'Olympics',
                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
            },
            'params': {
                'skip_download': 'requires avconv',
            }
        },
        # Non-square pixels
        {
            'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
            'info_dict': {
                'id': '_b-2C3KPAM0',
                'ext': 'mp4',
                'stretched_ratio': 16 / 9.,
                'upload_date': '20110310',
                'uploader_id': 'AllenMeow',
                'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
                'uploader': '孫艾倫',
                'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
            },
        }
    ]
 483
 484     def __init__(self, *args, **kwargs):
 485         super(YoutubeIE, self).__init__(*args, **kwargs)
 486         self._player_cache = {}
 487
 488     def report_video_info_webpage_download(self, video_id):
 489         """Report attempt to download video info webpage."""
 490         self.to_screen('%s: Downloading video info webpage' % video_id)
 491
 492     def report_information_extraction(self, video_id):
 493         """Report attempt to extract video information."""
 494         self.to_screen('%s: Extracting video information' % video_id)
 495
 496     def report_unavailable_format(self, video_id, format):
 497         """Report extracted video URL."""
 498         self.to_screen('%s: Format %s not available' % (video_id, format))
 499
 500     def report_rtmp_download(self):
 501         """Indicate the download will use the RTMP protocol."""
 502         self.to_screen('RTMP download detected')
 503
 504     def _signature_cache_id(self, example_sig):
 505         """ Return a string representation of a signature """
 506         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 507
 508     def _extract_signature_function(self, video_id, player_url, example_sig):
 509         id_m = re.match(
 510             r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 511             player_url)
 512         if not id_m:
 513             raise ExtractorError('Cannot identify player %r' % player_url)
 514         player_type = id_m.group('ext')
 515         player_id = id_m.group('id')
 516
 517         # Read from filesystem cache
 518         func_id = '%s_%s_%s' % (
 519             player_type, player_id, self._signature_cache_id(example_sig))
 520         assert os.path.basename(func_id) == func_id
 521
 522         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 523         if cache_spec is not None:
 524             return lambda s: ''.join(s[i] for i in cache_spec)
 525
 526         if player_type == 'js':
 527             code = self._download_webpage(
 528                 player_url, video_id,
 529                 note='Downloading %s player %s' % (player_type, player_id),
 530                 errnote='Download of %s failed' % player_url)
 531             res = self._parse_sig_js(code)
 532         elif player_type == 'swf':
 533             urlh = self._request_webpage(
 534                 player_url, video_id,
 535                 note='Downloading %s player %s' % (player_type, player_id),
 536                 errnote='Download of %s failed' % player_url)
 537             code = urlh.read()
 538             res = self._parse_sig_swf(code)
 539         else:
 540             assert False, 'Invalid player type %r' % player_type
 541
 542         if cache_spec is None:
 543             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 544             cache_res = res(test_string)
 545             cache_spec = [ord(c) for c in cache_res]
 546
 547         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 548         return res
 549
 550     def _print_sig_code(self, func, example_sig):
 551         def gen_sig_code(idxs):
 552             def _genslice(start, end, step):
 553                 starts = '' if start == 0 else str(start)
 554                 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
 555                 steps = '' if step == 1 else (':%d' % step)
 556                 return 's[%s%s%s]' % (starts, ends, steps)
 557
 558             step = None
 559             # Quelch pyflakes warnings - start will be set when step is set
 560             start = '(Never used)'
 561             for i, prev in zip(idxs[1:], idxs[:-1]):
 562                 if step is not None:
 563                     if i - prev == step:
 564                         continue
 565                     yield _genslice(start, prev, step)
 566                     step = None
 567                     continue
 568                 if i - prev in [-1, 1]:
 569                     step = i - prev
 570                     start = prev
 571                     continue
 572                 else:
 573                     yield 's[%d]' % prev
 574             if step is None:
 575                 yield 's[%d]' % i
 576             else:
 577                 yield _genslice(start, i, step)
 578
 579         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 580         cache_res = func(test_string)
 581         cache_spec = [ord(c) for c in cache_res]
 582         expr_code = ' + '.join(gen_sig_code(cache_spec))
 583         signature_id_tuple = '(%s)' % (
 584             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 585         code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 586                 '    return %s\n') % (signature_id_tuple, expr_code)
 587         self.to_screen('Extracted signature function:\n' + code)
 588
 589     def _parse_sig_js(self, jscode):
 590         funcname = self._search_regex(
 591             r'\.sig\|\|([a-zA-Z0-9\$]+)\(', jscode,
 592             'Initial JS player signature function name')
 593
 594         jsi = JSInterpreter(jscode)
 595         initial_function = jsi.extract_function(funcname)
 596         return lambda s: initial_function([s])
 597
 598     def _parse_sig_swf(self, file_contents):
 599         swfi = SWFInterpreter(file_contents)
 600         TARGET_CLASSNAME = 'SignatureDecipher'
 601         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 602         initial_function = swfi.extract_function(searched_class, 'decipher')
 603         return lambda s: initial_function([s])
 604
 605     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 606         """Turn the encrypted s field into a working signature"""
 607
 608         if player_url is None:
 609             raise ExtractorError('Cannot decrypt signature without player_url')
 610
 611         if player_url.startswith('//'):
 612             player_url = 'https:' + player_url
 613         try:
 614             player_id = (player_url, self._signature_cache_id(s))
 615             if player_id not in self._player_cache:
 616                 func = self._extract_signature_function(
 617                     video_id, player_url, s
 618                 )
 619                 self._player_cache[player_id] = func
 620             func = self._player_cache[player_id]
 621             if self._downloader.params.get('youtube_print_sig_code'):
 622                 self._print_sig_code(func, s)
 623             return func(s)
 624         except Exception as e:
 625             tb = traceback.format_exc()
 626             raise ExtractorError(
 627                 'Signature extraction failed: ' + tb, cause=e)
 628
 629     def _get_available_subtitles(self, video_id, webpage):
 630         try:
 631             subs_doc = self._download_xml(
 632                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 633                 video_id, note=False)
 634         except ExtractorError as err:
 635             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 636             return {}
 637
 638         sub_lang_list = {}
 639         for track in subs_doc.findall('track'):
 640             lang = track.attrib['lang_code']
 641             if lang in sub_lang_list:
 642                 continue
 643             params = compat_urllib_parse.urlencode({
 644                 'lang': lang,
 645                 'v': video_id,
 646                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 647                 'name': track.attrib['name'].encode('utf-8'),
 648             })
 649             url = 'https://www.youtube.com/api/timedtext?' + params
 650             sub_lang_list[lang] = url
 651         if not sub_lang_list:
 652             self._downloader.report_warning('video doesn\'t have subtitles')
 653             return {}
 654         return sub_lang_list
 655
 656     def _get_available_automatic_caption(self, video_id, webpage):
 657         """We need the webpage for getting the captions url, pass it as an
 658            argument to speed up the process."""
 659         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 660         self.to_screen('%s: Looking for automatic captions' % video_id)
 661         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 662         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 663         if mobj is None:
 664             self._downloader.report_warning(err_msg)
 665             return {}
 666         player_config = json.loads(mobj.group(1))
 667         try:
 668             args = player_config['args']
 669             caption_url = args['ttsurl']
 670             timestamp = args['timestamp']
 671             # We get the available subtitles
 672             list_params = compat_urllib_parse.urlencode({
 673                 'type': 'list',
 674                 'tlangs': 1,
 675                 'asrs': 1,
 676             })
 677             list_url = caption_url + '&' + list_params
 678             caption_list = self._download_xml(list_url, video_id)
 679             original_lang_node = caption_list.find('track')
 680             if original_lang_node is None:
 681                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 682                 return {}
 683             original_lang = original_lang_node.attrib['lang_code']
 684             caption_kind = original_lang_node.attrib.get('kind', '')
 685
 686             sub_lang_list = {}
 687             for lang_node in caption_list.findall('target'):
 688                 sub_lang = lang_node.attrib['lang_code']
 689                 params = compat_urllib_parse.urlencode({
 690                     'lang': original_lang,
 691                     'tlang': sub_lang,
 692                     'fmt': sub_format,
 693                     'ts': timestamp,
 694                     'kind': caption_kind,
 695                 })
 696                 sub_lang_list[sub_lang] = caption_url + '&' + params
 697             return sub_lang_list
 698         # An extractor error can be raise by the download process if there are
 699         # no automatic captions but there are subtitles
 700         except (KeyError, ExtractorError):
 701             self._downloader.report_warning(err_msg)
 702             return {}
 703
 704     @classmethod
 705     def extract_id(cls, url):
 706         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 707         if mobj is None:
 708             raise ExtractorError('Invalid URL: %s' % url)
 709         video_id = mobj.group(2)
 710         return video_id
 711
 712     def _extract_from_m3u8(self, manifest_url, video_id):
 713         url_map = {}
 714
 715         def _get_urls(_manifest):
 716             lines = _manifest.split('\n')
 717             urls = filter(lambda l: l and not l.startswith('#'),
 718                           lines)
 719             return urls
 720         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 721         formats_urls = _get_urls(manifest)
 722         for format_url in formats_urls:
 723             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 724             url_map[itag] = format_url
 725         return url_map
 726
 727     def _extract_annotations(self, video_id):
 728         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 729         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 730
 731     def _parse_dash_manifest(
 732             self, video_id, dash_manifest_url, player_url, age_gate):
 733         def decrypt_sig(mobj):
 734             s = mobj.group(1)
 735             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 736             return '/signature/%s' % dec_s
 737         dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 738         dash_doc = self._download_xml(
 739             dash_manifest_url, video_id,
 740             note='Downloading DASH manifest',
 741             errnote='Could not download DASH manifest')
 742
 743         formats = []
 744         for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 745             url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 746             if url_el is None:
 747                 continue
 748             format_id = r.attrib['id']
 749             video_url = url_el.text
 750             filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 751             f = {
 752                 'format_id': format_id,
 753                 'url': video_url,
 754                 'width': int_or_none(r.attrib.get('width')),
 755                 'height': int_or_none(r.attrib.get('height')),
 756                 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 757                 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 758                 'filesize': filesize,
 759                 'fps': int_or_none(r.attrib.get('frameRate')),
 760             }
 761             try:
 762                 existing_format = next(
 763                     fo for fo in formats
 764                     if fo['format_id'] == format_id)
 765             except StopIteration:
 766                 f.update(self._formats.get(format_id, {}).items())
 767                 formats.append(f)
 768             else:
 769                 existing_format.update(f)
 770         return formats
 771
    def _real_extract(self, url):
        """Extract metadata and the available formats for a single video.

        Downloads the watch page (and, for age-gated videos, the embed page
        and get_video_info), collects metadata from the page and the video
        info, builds the format list from the rtmp connection, the encoded
        stream maps or the HLS manifest, optionally merges in the formats of
        the DASH manifest, and returns the resulting info dict.

        Returns None after listing the subtitles when the 'listsubtitles'
        parameter is set.

        Raises ExtractorError when no "token" is present in the video info,
        for "rental" videos, when the uploader name is missing, for rtmpe
        streams, and when no usable format source is found.
        """
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslashes that escape characters in the embedded URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                # 'sts' is read from the embed page; empty when not found
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # NOTE(review): only ValueError triggers the fallback below; a
            # config without an 'args' key would raise KeyError instead.
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                # Try each 'el' variant in turn and stop at the first
                # response that contains a token
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Replace '/', ',' and '-' by spaces and collapse whitespace
            # before handing the text to unified_strdate
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # category: only the first link inside the "Category" list is used
        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace each redirect link by the value of its title attribute
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            # Fall back to the page's meta description, or an empty string
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Return the number shown in the element with id
            # "watch-<count_name>" (thousands separators removed), or None
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        # When only a subtitle listing was requested, print it and stop here
        # (the method then returns None)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn an itag -> URL dict into a list of format dicts, adding
            # the static data from self._formats for known itags
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            # Both maps are comma separated lists of url-encoded entries
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    # Plain signature: append as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: needs the player to decrypt it
                    encrypted_sig = url_data['s'][0]

                    # Note: this rebinds player_url, so the value found here
                    # is also the one used for the DASH manifest further down
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        embed_webpage if age_gate else video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    # Verbose mode: report which player and signature layout
                    # is being handled
                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    # A broken DASH manifest only costs the DASH formats
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    # Hide the formats we found through non-DASH
                    dash_keys = set(df['format_id'] for df in dash_formats)
                    for f in formats:
                        if f['format_id'] in dash_keys:
                            # Rename the duplicate and lower its preference
                            f['format_id'] = 'nondash-%s' % f['format_id']
                            f['preference'] = f.get('preference', 0) - 10000
                    formats.extend(dash_formats)

        # Check for malformed aspect ratio
        stretched_m = re.search(
            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
            video_webpage)
        if stretched_m:
            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
            # Applied to every format whose vcodec is not 'none'
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1099
1100
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    # Extractor for playlists, given either as a youtube.com URL carrying a
    # p=/a=/list= parameter (or a /p/<id> path) or as a bare playlist id.
    IE_DESC = 'YouTube.com playlists'
    # Group 1 captures the id found in a URL, group 2 a bare playlist id
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Matches one playlist entry link: the video id and its playlist index
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # Was misspelled 'playlist_mincout'; every other test here uses
        # 'playlist_mincount'
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extraction (see _login)."""
        self._login()

    def _ids_to_results(self, ids):
        """Return one url_result for the 'Youtube' extractor per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist (id starting with 'RD') from its watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title element classes in turn
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        # Collect, in page order and without duplicates, the ids of the
        # videos linked together with this playlist id
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract the playlist named by ``url`` (a playlist URL or bare id).

        When the URL also carries a ``v`` parameter and the 'noplaylist'
        option is set, only that video is returned.  Ids starting with 'RD'
        are handled by _extract_mix.  Otherwise the playlist page and its
        "load more" continuations are scanned for video ids.

        Raises ExtractorError when ``url`` does not match _VALID_URL or when
        the page reports that the playlist does not exist or is private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # For the first pass both the entries and the "load more" link are
        # looked up in the initial page
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1278
1279
class YoutubeChannelIE(InfoExtractor):
    """Extractor listing the videos of a ``/channel/<id>`` page."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*.

        Ids are listed in order of first appearance, without duplicates.
        """
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with the videos of the channel in *url*."""
        channel_id = self._match_id(url)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            return self.playlist_result(
                [self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _entries():
            # Lazily yield one url result per video, following the
            # "load more" links from one chunk to the next
            content_html = channel_page
            more_widget_html = channel_page
            pagenum = 1
            while True:
                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                more_link = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if more_link is None:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % more_link.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']
                pagenum += 1

        return self.playlist_result(_entries(), channel_id)
1341
1342
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user, listed via the GData API."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module claims *url*."""
        # The regex of this extractor is too permissive: it would also match
        # URLs that another youtube extractor can handle, so those extractors
        # are asked first.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploaded videos."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # GData start indices are 1-based; a page covers the inclusive
            # range [start_index, end_index]
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing is yielded for it
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1411
1412
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor that queries the GData videos feed page by page."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        collected_ids = []
        # Upper bound on the results to fetch; lowered once the API reports
        # how many items exist in total
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            first_index = PAGE_SIZE * pagenum + 1
            data_json = self._download_webpage(
                self._API_URL % (encoded_query, first_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                [video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past the requested amount
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1454
1455
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds ``orderby=published``."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1461
1462
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor turning a ``/results?search_query=...`` page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result title block."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            # The title is optional (fatal=False), the link is mandatory
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        entries = [
            build_entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1504
1505
class YoutubeShowIE(InfoExtractor):
    """Extractor for ``/show/<id>`` pages, yielding one playlist per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1540
1541
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common logic for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # The returned URL keeps a single %s placeholder for the paging value
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist with every video found across the feed pages."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, look for the next link in the feed
            # HTML itself
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1589
1590
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``recommended`` feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1596
1597
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal ``watch_later`` feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1604
1605
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal ``history`` feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the other regexes in this module: without the prefix
    # "\." is an invalid string escape that newer Python versions warn about.
    # The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1612
1613
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that redirects to the playlist behind ``/my_favorites``."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" value in the page
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1624
1625
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor collecting the videos shown on ``/feed/subscriptions``."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    IE_NAME = 'youtube:subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist with the videos of every subscriptions page."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            page_ids = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            # Duplicates are only removed within a single page
            ids += orderedSet(page_ids)

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if more_link is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1662
1663
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and reports an error."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)
1691
1692
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id has 1 to 10 characters and reports an error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # The "www." prefix is optional: the group previously had no quantifier,
    # which made the prefix mandatory
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)