_ Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import time
  11 import traceback
  12
  13 from .common import InfoExtractor, SearchInfoExtractor
  14 from .subtitles import SubtitlesInfoExtractor
  15 from ..jsinterp import JSInterpreter
  16 from ..swfinterp import SWFInterpreter
  17 from ..compat import (
  18     compat_chr,
  19     compat_parse_qs,
  20     compat_urllib_parse,
  21     compat_urllib_request,
  22     compat_urlparse,
  23     compat_str,
  24 )
  25 from ..utils import (
  26     clean_html,
  27     ExtractorError,
  28     get_element_by_attribute,
  29     get_element_by_id,
  30     int_or_none,
  31     js_to_json,
  32     OnDemandPagedList,
  33     orderedSet,
  34     unescapeHTML,
  35     unified_strdate,
  36     uppercase_escape,
  37 )
  38
  39
  40 class YoutubeBaseInfoExtractor(InfoExtractor):
  41     """Provide base functions for Youtube extractors"""
  42     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  43     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  44     _NETRC_MACHINE = 'youtube'
  45     # If True it will raise an error if no login info is provided
  46     _LOGIN_REQUIRED = False
  47
  48     def _set_language(self):
  49         self._set_cookie(
  50             '.youtube.com', 'PREF', 'f1=50000000&hl=en',
  51             # YouTube sets the expire time to about two months
  52             expire_time=time.time() + 2 * 30 * 24 * 3600)
  53
  54     def _login(self):
  55         """
  56         Attempt to log in to YouTube.
  57         True is returned if successful or skipped.
  58         False is returned if login failed.
  59
  60         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  61         """
  62         (username, password) = self._get_login_info()
  63         # No authentication to be performed
  64         if username is None:
  65             if self._LOGIN_REQUIRED:
  66                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  67             return True
  68
  69         login_page = self._download_webpage(
  70             self._LOGIN_URL, None,
  71             note='Downloading login page',
  72             errnote='unable to fetch login page', fatal=False)
  73         if login_page is False:
  74             return
  75
  76         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  77                                   login_page, 'Login GALX parameter')
  78
  79         # Log in
  80         login_form_strs = {
  81             'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  82             'Email': username,
  83             'GALX': galx,
  84             'Passwd': password,
  85
  86             'PersistentCookie': 'yes',
  87             '_utf8': '霱',
  88             'bgresponse': 'js_disabled',
  89             'checkConnection': '',
  90             'checkedDomains': 'youtube',
  91             'dnConn': '',
  92             'pstMsg': '0',
  93             'rmShown': '1',
  94             'secTok': '',
  95             'signIn': 'Sign in',
  96             'timeStmp': '',
  97             'service': 'youtube',
  98             'uilel': '3',
  99             'hl': 'en_US',
 100         }
 101
 102         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 103         # chokes on unicode
 104         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
 105         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 106
 107         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 108         login_results = self._download_webpage(
 109             req, None,
 110             note='Logging in', errnote='unable to log in', fatal=False)
 111         if login_results is False:
 112             return False
 113
 114         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 115             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 116
 117         # Two-Factor
 118         # TODO add SMS and phone call support - these require making a request and then prompting the user
 119
 120         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 121             tfa_code = self._get_tfa_info()
 122
 123             if tfa_code is None:
 124                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 125                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 126                 return False
 127
 128             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 129
 130             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 131             if match is None:
 132                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 133             secTok = match.group(1)
 134             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 135             if match is None:
 136                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 137             timeStmp = match.group(1)
 138
 139             tfa_form_strs = {
 140                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 141                 'smsToken': '',
 142                 'smsUserPin': tfa_code,
 143                 'smsVerifyPin': 'Verify',
 144
 145                 'PersistentCookie': 'yes',
 146                 'checkConnection': '',
 147                 'checkedDomains': 'youtube',
 148                 'pstMsg': '1',
 149                 'secTok': secTok,
 150                 'timeStmp': timeStmp,
 151                 'service': 'youtube',
 152                 'hl': 'en_US',
 153             }
 154             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 155             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 156
 157             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 158             tfa_results = self._download_webpage(
 159                 tfa_req, None,
 160                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 161
 162             if tfa_results is False:
 163                 return False
 164
 165             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 166                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 167                 return False
 168             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 169                 self._downloader.report_warning('unable to log in - did the page structure change?')
 170                 return False
 171             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 172                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 173                 return False
 174
 175         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 176             self._downloader.report_warning('unable to log in: bad username or password')
 177             return False
 178         return True
 179
 180     def _real_initialize(self):
 181         if self._downloader is None:
 182             return
 183         self._set_language()
 184         if not self._login():
 185             return
 186
 187
 188 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 189     IE_DESC = 'YouTube.com'
 190     _VALID_URL = r"""(?x)^
 191                      (
 192                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 193                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 194                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 195                             (?:www\.)?pwnyoutube\.com/|
 196                             (?:www\.)?yourepeat\.com/|
 197                             tube\.majestyc\.net/|
 198                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 199                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 200                          (?:                                                  # the various things that can precede the ID:
 201                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 202                              |(?:                                             # or the v= param in all its forms
 203                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 204                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 205                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 206                                  v=
 207                              )
 208                          ))
 209                          |youtu\.be/                                          # just youtu.be/xxxx
 210                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 211                          )
 212                      )?                                                       # all until now is optional -> you can pass the naked ID
 213                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 214                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 215                      (?(1).+)?                                                # if we found the ID, everything can follow
 216                      $"""
 217     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 218     _formats = {
 219         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 220         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 221         '13': {'ext': '3gp'},
 222         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 223         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 224         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 225         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 226         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 227         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 228         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 229         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 230         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 231         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 232         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 233         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 234
 235
 236         # 3d videos
 237         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 238         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 239         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 240         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 241         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 242         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 243         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 244
 245         # Apple HTTP Live Streaming
 246         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 247         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 248         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 249         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 250         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 251         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 252         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 253
 254         # DASH mp4 video
 255         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 256         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 257         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 258         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 259         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 260         '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
 261         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 262         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 263         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 264         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 265         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 266
 267         # Dash mp4 audio
 268         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
 269         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
 270         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
 271
 272         # Dash webm
 273         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 274         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 275         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 276         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 277         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 278         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 279         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
 280         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 281         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 282         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 283         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 284         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 285         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 286         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 287         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 288         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 289         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 290         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 291         '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 292         '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
 293         '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 294
 295         # Dash webm audio
 296         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 297         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 298
 299         # Dash webm audio with opus inside
 300         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 301         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 302         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 303
 304         # RTMP (unnamed)
 305         '_rtmp': {'protocol': 'rtmp'},
 306     }
 307
 308     IE_NAME = 'youtube'
 309     _TESTS = [
 310         {
 311             'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
 312             'info_dict': {
 313                 'id': 'BaW_jenozKc',
 314                 'ext': 'mp4',
 315                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 316                 'uploader': 'Philipp Hagemeister',
 317                 'uploader_id': 'phihag',
 318                 'upload_date': '20121002',
 319                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 320                 'categories': ['Science & Technology'],
 321                 'like_count': int,
 322                 'dislike_count': int,
 323             }
 324         },
 325         {
 326             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 327             'note': 'Test generic use_cipher_signature video (#897)',
 328             'info_dict': {
 329                 'id': 'UxxajLWwzqY',
 330                 'ext': 'mp4',
 331                 'upload_date': '20120506',
 332                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 333                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
 334                 'uploader': 'Icona Pop',
 335                 'uploader_id': 'IconaPop',
 336             }
 337         },
 338         {
 339             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 340             'note': 'Test VEVO video with age protection (#956)',
 341             'info_dict': {
 342                 'id': '07FYdnEawAQ',
 343                 'ext': 'mp4',
 344                 'upload_date': '20130703',
 345                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 346                 'description': 'md5:64249768eec3bc4276236606ea996373',
 347                 'uploader': 'justintimberlakeVEVO',
 348                 'uploader_id': 'justintimberlakeVEVO',
 349             }
 350         },
 351         {
 352             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 353             'note': 'Embed-only video (#1746)',
 354             'info_dict': {
 355                 'id': 'yZIXLfi8CZQ',
 356                 'ext': 'mp4',
 357                 'upload_date': '20120608',
 358                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 359                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 360                 'uploader': 'SET India',
 361                 'uploader_id': 'setindia'
 362             }
 363         },
 364         {
 365             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 366             'note': '256k DASH audio (format 141) via DASH manifest',
 367             'info_dict': {
 368                 'id': 'a9LDPn-MO4I',
 369                 'ext': 'm4a',
 370                 'upload_date': '20121002',
 371                 'uploader_id': '8KVIDEO',
 372                 'description': '',
 373                 'uploader': '8KVIDEO',
 374                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 375             },
 376             'params': {
 377                 'youtube_include_dash_manifest': True,
 378                 'format': '141',
 379             },
 380         },
 381         # DASH manifest with encrypted signature
 382         {
 383             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 384             'info_dict': {
 385                 'id': 'IB3lcPjvWLA',
 386                 'ext': 'm4a',
 387                 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
 388                 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
 389                 'uploader': 'AfrojackVEVO',
 390                 'uploader_id': 'AfrojackVEVO',
 391                 'upload_date': '20131011',
 392             },
 393             'params': {
 394                 'youtube_include_dash_manifest': True,
 395                 'format': '141',
 396             },
 397         },
 398         # JS player signature function name containing $
 399         {
 400             'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
 401             'info_dict': {
 402                 'id': 'nfWlot6h_JM',
 403                 'ext': 'm4a',
 404                 'title': 'Taylor Swift - Shake It Off',
 405                 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
 406                 'uploader': 'TaylorSwiftVEVO',
 407                 'uploader_id': 'TaylorSwiftVEVO',
 408                 'upload_date': '20140818',
 409             },
 410             'params': {
 411                 'youtube_include_dash_manifest': True,
 412                 'format': '141',
 413             },
 414         },
 415         # Controversy video
 416         {
 417             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
 418             'info_dict': {
 419                 'id': 'T4XJQO3qol8',
 420                 'ext': 'mp4',
 421                 'upload_date': '20100909',
 422                 'uploader': 'The Amazing Atheist',
 423                 'uploader_id': 'TheAmazingAtheist',
 424                 'title': 'Burning Everyone\'s Koran',
 425                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
 426             }
 427         },
 428         # Normal age-gate video (No vevo, embed allowed)
 429         {
 430             'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
 431             'info_dict': {
 432                 'id': 'HtVdAasjOgU',
 433                 'ext': 'mp4',
 434                 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
 435                 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
 436                 'uploader': 'The Witcher',
 437                 'uploader_id': 'WitcherGame',
 438                 'upload_date': '20140605',
 439             },
 440         },
 441         # Age-gate video with encrypted signature
 442         {
 443             'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
 444             'info_dict': {
 445                 'id': '6kLq3WMV1nU',
 446                 'ext': 'mp4',
 447                 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
 448                 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
 449                 'uploader': 'LloydVEVO',
 450                 'uploader_id': 'LloydVEVO',
 451                 'upload_date': '20110629',
 452             },
 453         },
 454         # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
 455         {
 456             'url': '__2ABJjxzNo',
 457             'info_dict': {
 458                 'id': '__2ABJjxzNo',
 459                 'ext': 'mp4',
 460                 'upload_date': '20100430',
 461                 'uploader_id': 'deadmau5',
 462                 'description': 'md5:12c56784b8032162bb936a5f76d55360',
 463                 'uploader': 'deadmau5',
 464                 'title': 'Deadmau5 - Some Chords (HD)',
 465             },
 466             'expected_warnings': [
 467                 'DASH manifest missing',
 468             ]
 469         },
 470         # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
 471         {
 472             'url': 'lqQg6PlCWgI',
 473             'info_dict': {
 474                 'id': 'lqQg6PlCWgI',
 475                 'ext': 'mp4',
 476                 'upload_date': '20120731',
 477                 'uploader_id': 'olympic',
 478                 'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
 479                 'uploader': 'Olympics',
 480                 'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
 481             },
 482             'params': {
 483                 'skip_download': 'requires avconv',
 484             }
 485         },
 486         # Non-square pixels
 487         {
 488             'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
 489             'info_dict': {
 490                 'id': '_b-2C3KPAM0',
 491                 'ext': 'mp4',
 492                 'stretched_ratio': 16 / 9.,
 493                 'upload_date': '20110310',
 494                 'uploader_id': 'AllenMeow',
 495                 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
 496                 'uploader': '孫艾倫',
 497                 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
 498             },
 499         }
 500     ]
 501
 502     def __init__(self, *args, **kwargs):
 503         super(YoutubeIE, self).__init__(*args, **kwargs)
 504         self._player_cache = {}
 505
 506     def report_video_info_webpage_download(self, video_id):
 507         """Report attempt to download video info webpage."""
 508         self.to_screen('%s: Downloading video info webpage' % video_id)
 509
 510     def report_information_extraction(self, video_id):
 511         """Report attempt to extract video information."""
 512         self.to_screen('%s: Extracting video information' % video_id)
 513
 514     def report_unavailable_format(self, video_id, format):
 515         """Report extracted video URL."""
 516         self.to_screen('%s: Format %s not available' % (video_id, format))
 517
 518     def report_rtmp_download(self):
 519         """Indicate the download will use the RTMP protocol."""
 520         self.to_screen('RTMP download detected')
 521
 522     def _signature_cache_id(self, example_sig):
 523         """ Return a string representation of a signature """
 524         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 525
 526     def _extract_signature_function(self, video_id, player_url, example_sig):
 527         id_m = re.match(
 528             r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 529             player_url)
 530         if not id_m:
 531             raise ExtractorError('Cannot identify player %r' % player_url)
 532         player_type = id_m.group('ext')
 533         player_id = id_m.group('id')
 534
 535         # Read from filesystem cache
 536         func_id = '%s_%s_%s' % (
 537             player_type, player_id, self._signature_cache_id(example_sig))
 538         assert os.path.basename(func_id) == func_id
 539
 540         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 541         if cache_spec is not None:
 542             return lambda s: ''.join(s[i] for i in cache_spec)
 543
 544         if player_type == 'js':
 545             code = self._download_webpage(
 546                 player_url, video_id,
 547                 note='Downloading %s player %s' % (player_type, player_id),
 548                 errnote='Download of %s failed' % player_url)
 549             res = self._parse_sig_js(code)
 550         elif player_type == 'swf':
 551             urlh = self._request_webpage(
 552                 player_url, video_id,
 553                 note='Downloading %s player %s' % (player_type, player_id),
 554                 errnote='Download of %s failed' % player_url)
 555             code = urlh.read()
 556             res = self._parse_sig_swf(code)
 557         else:
 558             assert False, 'Invalid player type %r' % player_type
 559
 560         if cache_spec is None:
 561             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 562             cache_res = res(test_string)
 563             cache_spec = [ord(c) for c in cache_res]
 564
 565         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 566         return res
 567
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function func.

        func is probed with a string whose i-th character has code point i
        (one character per character of example_sig). The code points of the
        result give, for every output position, the input index it was taken
        from. That index list is rendered as a sum of index/slice expressions
        on `s`, guarded by a check on the lengths of the dot-separated parts,
        and written out via to_screen.
        """
        def gen_sig_code(idxs):
            # Yield 's[...]' expression strings covering idxs in order, with
            # runs of indices that step by +1 or -1 collapsed into one slice.
            # NOTE(review): if idxs has fewer than two elements the loop body
            # never runs, so `i` is unbound at the final yield.
            def _genslice(start, end, step):
                # start and end are both inclusive indices of the run
                starts = '' if start == 0 else str(start)
                # The exclusive slice bound is end + step; a negative bound is
                # left out so the slice runs to the edge of the string.
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            # step is None while no run is in progress
            step = None
            # Squelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        # Run continues
                        continue
                    # Run ended at prev; i is looked at again as `prev` in the
                    # next iteration
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new run starts at prev
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever is still pending for the last index
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
 606
 607     def _parse_sig_js(self, jscode):
 608         funcname = self._search_regex(
 609             r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
 610             'Initial JS player signature function name')
 611
 612         jsi = JSInterpreter(jscode)
 613         initial_function = jsi.extract_function(funcname)
 614         return lambda s: initial_function([s])
 615
 616     def _parse_sig_swf(self, file_contents):
 617         swfi = SWFInterpreter(file_contents)
 618         TARGET_CLASSNAME = 'SignatureDecipher'
 619         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 620         initial_function = swfi.extract_function(searched_class, 'decipher')
 621         return lambda s: initial_function([s])
 622
 623     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 624         """Turn the encrypted s field into a working signature"""
 625
 626         if player_url is None:
 627             raise ExtractorError('Cannot decrypt signature without player_url')
 628
 629         if player_url.startswith('//'):
 630             player_url = 'https:' + player_url
 631         try:
 632             player_id = (player_url, self._signature_cache_id(s))
 633             if player_id not in self._player_cache:
 634                 func = self._extract_signature_function(
 635                     video_id, player_url, s
 636                 )
 637                 self._player_cache[player_id] = func
 638             func = self._player_cache[player_id]
 639             if self._downloader.params.get('youtube_print_sig_code'):
 640                 self._print_sig_code(func, s)
 641             return func(s)
 642         except Exception as e:
 643             tb = traceback.format_exc()
 644             raise ExtractorError(
 645                 'Signature extraction failed: ' + tb, cause=e)
 646
 647     def _get_available_subtitles(self, video_id, webpage):
 648         try:
 649             subs_doc = self._download_xml(
 650                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 651                 video_id, note=False)
 652         except ExtractorError as err:
 653             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 654             return {}
 655
 656         sub_lang_list = {}
 657         for track in subs_doc.findall('track'):
 658             lang = track.attrib['lang_code']
 659             if lang in sub_lang_list:
 660                 continue
 661             params = compat_urllib_parse.urlencode({
 662                 'lang': lang,
 663                 'v': video_id,
 664                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 665                 'name': track.attrib['name'].encode('utf-8'),
 666             })
 667             url = 'https://www.youtube.com/api/timedtext?' + params
 668             sub_lang_list[lang] = url
 669         if not sub_lang_list:
 670             self._downloader.report_warning('video doesn\'t have subtitles')
 671             return {}
 672         return sub_lang_list
 673
 674     def _get_available_automatic_caption(self, video_id, webpage):
 675         """We need the webpage for getting the captions url, pass it as an
 676            argument to speed up the process."""
 677         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 678         self.to_screen('%s: Looking for automatic captions' % video_id)
 679         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 680         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 681         if mobj is None:
 682             self._downloader.report_warning(err_msg)
 683             return {}
 684         player_config = json.loads(mobj.group(1))
 685         try:
 686             args = player_config['args']
 687             caption_url = args['ttsurl']
 688             timestamp = args['timestamp']
 689             # We get the available subtitles
 690             list_params = compat_urllib_parse.urlencode({
 691                 'type': 'list',
 692                 'tlangs': 1,
 693                 'asrs': 1,
 694             })
 695             list_url = caption_url + '&' + list_params
 696             caption_list = self._download_xml(list_url, video_id)
 697             original_lang_node = caption_list.find('track')
 698             if original_lang_node is None:
 699                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 700                 return {}
 701             original_lang = original_lang_node.attrib['lang_code']
 702             caption_kind = original_lang_node.attrib.get('kind', '')
 703
 704             sub_lang_list = {}
 705             for lang_node in caption_list.findall('target'):
 706                 sub_lang = lang_node.attrib['lang_code']
 707                 params = compat_urllib_parse.urlencode({
 708                     'lang': original_lang,
 709                     'tlang': sub_lang,
 710                     'fmt': sub_format,
 711                     'ts': timestamp,
 712                     'kind': caption_kind,
 713                 })
 714                 sub_lang_list[sub_lang] = caption_url + '&' + params
 715             return sub_lang_list
 716         # An extractor error can be raise by the download process if there are
 717         # no automatic captions but there are subtitles
 718         except (KeyError, ExtractorError):
 719             self._downloader.report_warning(err_msg)
 720             return {}
 721
 722     @classmethod
 723     def extract_id(cls, url):
 724         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 725         if mobj is None:
 726             raise ExtractorError('Invalid URL: %s' % url)
 727         video_id = mobj.group(2)
 728         return video_id
 729
 730     def _extract_from_m3u8(self, manifest_url, video_id):
 731         url_map = {}
 732
 733         def _get_urls(_manifest):
 734             lines = _manifest.split('\n')
 735             urls = filter(lambda l: l and not l.startswith('#'),
 736                           lines)
 737             return urls
 738         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 739         formats_urls = _get_urls(manifest)
 740         for format_url in formats_urls:
 741             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 742             url_map[itag] = format_url
 743         return url_map
 744
 745     def _extract_annotations(self, video_id):
 746         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 747         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 748
 749     def _parse_dash_manifest(
 750             self, video_id, dash_manifest_url, player_url, age_gate):
 751         def decrypt_sig(mobj):
 752             s = mobj.group(1)
 753             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 754             return '/signature/%s' % dec_s
 755         dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 756         dash_doc = self._download_xml(
 757             dash_manifest_url, video_id,
 758             note='Downloading DASH manifest',
 759             errnote='Could not download DASH manifest')
 760
 761         formats = []
 762         for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 763             url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 764             if url_el is None:
 765                 continue
 766             format_id = r.attrib['id']
 767             video_url = url_el.text
 768             filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 769             f = {
 770                 'format_id': format_id,
 771                 'url': video_url,
 772                 'width': int_or_none(r.attrib.get('width')),
 773                 'height': int_or_none(r.attrib.get('height')),
 774                 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 775                 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 776                 'filesize': filesize,
 777                 'fps': int_or_none(r.attrib.get('frameRate')),
 778             }
 779             try:
 780                 existing_format = next(
 781                     fo for fo in formats
 782                     if fo['format_id'] == format_id)
 783             except StopIteration:
 784                 f.update(self._formats.get(format_id, {}).items())
 785                 formats.append(f)
 786             else:
 787                 existing_format.update(f)
 788         return formats
 789
    def _real_extract(self, url):
        """Extract the metadata and the downloadable formats of one video.

        Downloads the watch page (plus the embed page and get_video_info
        pages when needed) and collects uploader, title, thumbnail, upload
        date, category, description, like/dislike counts, subtitles,
        duration and, on request, annotations.  The format list is built
        from the rtmp connection, the encoded stream maps or the HLS
        manifest, and is then extended with the formats of the DASH
        manifest.

        Returns the info dict, or None after listing the subtitles when the
        'listsubtitles' parameter is set.  Raises ExtractorError when the
        video info has no token, for rental videos, when the uploader is
        missing, for rtmpe streams and when no source of formats is found.
        """
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslashes that escape characters in the matched URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        embed_webpage = None
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                # Empty when the embed page carries no "sts" value
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                # NOTE(review): a config without an 'args' key raises KeyError
                # here, which the except clause below does not catch - confirm
                # this cannot happen
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                # Try each "el" variant in turn and stop at the first answer
                # that contains a token
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        # Without a token the video cannot be processed; pass on the reason
        # given in the video info when there is one
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Replace '/', ',' and '-' by spaces and collapse runs of
            # whitespace before handing the text to unified_strdate
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # categories: only the first link inside the "Category" list is used
        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace each redirect link by the value of its title attribute
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            # Fall back to the page's description meta tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Number shown in the element with id "watch-<count_name>", with
            # the thousands separators removed; None when it is not found
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        # When only a listing of the subtitles was requested, stop here
        # without returning an info dict
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn an itag -> URL dict into format dicts, completed with the
            # static data in self._formats for itags listed there
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        # Formats come from exactly one of: the rtmp connection, the encoded
        # stream maps, or the HLS manifest
        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            # Both maps are comma separated lists of query strings; handle
            # them as a single list
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                # 'sig' is appended as is, while 's' has to be decrypted
                # with the help of the player first
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE,
                        embed_webpage if age_gate else video_webpage,
                        'JS player URL (1)', default=None)
                    if not jsplayer_url_json and not age_gate:
                        # We need the embed website after all
                        if embed_webpage is None:
                            embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                            embed_webpage = self._download_webpage(
                                embed_url, video_id, 'Downloading embed webpage')
                        jsplayer_url_json = self._search_regex(
                            ASSETS_RE, embed_webpage, 'JS player URL')

                    # NOTE(review): with age_gate set and no ASSETS_RE match,
                    # jsplayer_url_json appears to still be None here
                    # (default=None above), which json.loads would reject -
                    # confirm
                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    # In verbose mode, show which player and which signature
                    # layout are in use
                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    # A broken manifest only costs the DASH formats
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    # Hide the formats we found through non-DASH
                    dash_keys = set(df['format_id'] for df in dash_formats)
                    for f in formats:
                        if f['format_id'] in dash_keys:
                            # Rename the duplicate and lower its preference
                            # by 10000
                            f['format_id'] = 'nondash-%s' % f['format_id']
                            f['preference'] = f.get('preference', 0) - 10000
                    formats.extend(dash_formats)

        # Check for malformed aspect ratio
        stretched_m = re.search(
            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
            video_webpage)
        if stretched_m:
            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
            # Applied to every format whose vcodec is not 'none'
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1129
1130
1131 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Verbose regex with two alternatives: group 1 is the playlist id taken
    # from a youtube.com URL (value of the p, a or list query parameter, or
    # the part after /p/), group 2 is a bare playlist id that starts with one
    # of the known prefixes
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    # URL of the playlist page; %s is replaced by the playlist id
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Matches a watch link that carries an index parameter and captures the
    # video id and the index
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
1154     _TESTS = [{
1155         'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1156         'info_dict': {
1157             'title': 'ytdl test PL',
1158             'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1159         },
1160         'playlist_count': 3,
1161     }, {
1162         'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1163         'info_dict': {
1164             'title': 'YDL_Empty_List',
1165         },
1166         'playlist_count': 0,
1167     }, {
1168         'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1169         'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1170         'info_dict': {
1171             'title': '29C3: Not my department',
1172         },
1173         'playlist_count': 95,
1174     }, {
1175         'note': 'issue #673',
1176         'url': 'PLBB231211A4F62143',
1177         'info_dict': {
1178             'title': '[OLD]Team Fortress 2 (Class-based LP)',
1179         },
1180         'playlist_mincount': 26,
1181     }, {
1182         'note': 'Large playlist',
1183         'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1184         'info_dict': {
1185             'title': 'Uploads from Cauchemar',
1186         },
1187         'playlist_mincount': 799,
1188     }, {
1189         'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1190         'info_dict': {
1191             'title': 'YDL_safe_search',
1192         },
1193         'playlist_count': 2,
1194     }, {
1195         'note': 'embedded',
1196         'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1197         'playlist_count': 4,
1198         'info_dict': {
1199             'title': 'JODA15',
1200         }
1201     }, {
1202         'note': 'Embedded SWF player',
1203         'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1204         'playlist_count': 4,
1205         'info_dict': {
1206             'title': 'JODA7',
1207         }
1208     }, {
1209         'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1210         'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1211         'info_dict': {
1212                 'title': 'Uploads from Interstellar Movie',
1213         },
1214         'playlist_mincout': 21,
1215     }]
1216
    def _real_initialize(self):
        """Attempt to log in to YouTube by calling _login()."""
        self._login()
1219
1220     def _ids_to_results(self, ids):
1221         return [
1222             self.url_result(vid_id, 'Youtube', video_id=vid_id)
1223             for vid_id in ids]
1224
1225     def _extract_mix(self, playlist_id):
1226         # The mixes are generated from a a single video
1227         # the id of the playlist is just 'RD' + video_id
1228         url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1229         webpage = self._download_webpage(
1230             url, playlist_id, 'Downloading Youtube mix')
1231         search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1232         title_span = (
1233             search_title('playlist-title') or
1234             search_title('title long-title') or
1235             search_title('title'))
1236         title = clean_html(title_span)
1237         ids = orderedSet(re.findall(
1238             r'''(?xs)data-video-username=".*?".*?
1239                        href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1240             webpage))
1241         url_results = self._ids_to_results(ids)
1242
1243         return self.playlist_result(url_results, playlist_id, title)
1244
1245     def _real_extract(self, url):
1246         # Extract playlist id
1247         mobj = re.match(self._VALID_URL, url)
1248         if mobj is None:
1249             raise ExtractorError('Invalid URL: %s' % url)
1250         playlist_id = mobj.group(1) or mobj.group(2)
1251
1252         # Check if it's a video-specific URL
1253         query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1254         if 'v' in query_dict:
1255             video_id = query_dict['v'][0]
1256             if self._downloader.params.get('noplaylist'):
1257                 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1258                 return self.url_result(video_id, 'Youtube', video_id=video_id)
1259             else:
1260                 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1261
1262         if playlist_id.startswith('RD'):
1263             # Mixes require a custom extraction process
1264             return self._extract_mix(playlist_id)
1265
1266         url = self._TEMPLATE_URL % playlist_id
1267         page = self._download_webpage(url, playlist_id)
1268         more_widget_html = content_html = page
1269
1270         # Check if the playlist exists or is private
1271         if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
1272             raise ExtractorError(
1273                 'The playlist doesn\'t exist or is private, use --username or '
1274                 '--netrc to access it.',
1275                 expected=True)
1276
1277         # Extract the video ids from the playlist pages
1278         ids = []
1279
1280         for page_num in itertools.count(1):
1281             matches = re.finditer(self._VIDEO_RE, content_html)
1282             # We remove the duplicates and the link with index 0
1283             # (it's not the first video of the playlist)
1284             new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1285             ids.extend(new_ids)
1286
1287             mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1288             if not mobj:
1289                 break
1290
1291             more = self._download_json(
1292                 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1293                 'Downloading page #%s' % page_num,
1294                 transform_source=uppercase_escape)
1295             content_html = more['content_html']
1296             if not content_html.strip():
1297                 # Some webpages show a "Load more" button but they don't
1298                 # have more videos
1299                 break
1300             more_widget_html = more['load_more_widget_html']
1301
1302         playlist_title = self._html_search_regex(
1303             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
1304             page, 'title')
1305
1306         url_results = self._ids_to_results(ids)
1307         return self.playlist_result(url_results, playlist_id, playlist_title)
1308
1309
class YoutubeChannelIE(InfoExtractor):
    """Extract the videos listed on a YouTube channel's /videos page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from `page`.

        Duplicates are dropped; ids keep the order of their first appearance.
        """
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding the channel's videos.

        Autogenerated channels are read from the single downloaded page and
        returned as a list; all other channels are returned as a generator
        that follows the "load more" links while it is being consumed.
        """
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            return self.playlist_result(
                [self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _entries():
            # Start with the channel page itself, then continue with the HTML
            # fragments returned by each "load more" request.
            content_html = channel_page
            more_widget_html = channel_page
            for pagenum in itertools.count(1):
                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                more_match = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not more_match:
                    return

                more = self._download_json(
                    'https://youtube.com/%s' % more_match.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1371
1372
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user through the GData feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class of this module accepts `url`."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video on feed page `pagenum`.

            `pagenum` is mapped to a 1-based start index. Nothing is yielded
            when the feed has no 'entry' key. Raises ExtractorError if the
            response is not valid JSON.
            """
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # The reported range is inclusive, so a full page ends at
            # start_index + page size - 1.
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1441
1442
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData videos feed."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        collected_ids = []
        # Starts as the requested amount and is lowered to the total the API
        # reports once the first page has been read.
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1484
1485
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1491
1492
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the entries shown on a YouTube search results page."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict titled with the decoded search query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is read from its <h3 class="yt-lockup-title"> element
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title lookup is non-fatal; the URL lookup is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1534
1535
class YoutubeShowIE(InfoExtractor):
    """Extract a show as a playlist of its season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one YoutubePlaylist entry per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title lookup is non-fatal
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
1570
1571
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; its single %s slot takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Attempt to log in before extraction; the result of _login() is discarded."""
        self._login()

    def _real_extract(self, url):
        """Collect the videos of every feed page into one playlist result."""
        feed_entries = []
        # Starts at 0; afterwards it holds the digits captured from the
        # "load more" link of the previous page.
        paging = 0
        for page_index in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_index,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Fall back to the feed HTML when no separate widget is returned
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            paging_match = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if paging_match is None:
                break
            paging = paging_match.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1619
1620
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1626
1627
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1634
1635
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is not a valid escape sequence in a regular string
    # literal, so the pattern is written like the other _VALID_URL patterns.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1642
1643
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and hand it to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url_result for the playlist id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is the first list=... value that ends at a '"' or '&'
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1654
1655
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos of the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with the videos from every page of the feed."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        content_html = page
        more_widget_html = page

        for page_num in itertools.count(1):
            # Duplicates are removed within each page before the ids are kept
            page_ids = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            ids.extend(orderedSet(page_ids))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1692
1693
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution_link URLs that carry no video id and report them."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
1737
1738
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video id has 1 to 10 characters and report them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)