git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import time
  11 import traceback
  12
  13 from .common import InfoExtractor, SearchInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..compat import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_parse_unquote,
  21     compat_urllib_parse_unquote_plus,
  22     compat_urllib_parse_urlparse,
  23     compat_urllib_request,
  24     compat_urlparse,
  25     compat_str,
  26 )
  27 from ..utils import (
  28     clean_html,
  29     encode_dict,
  30     ExtractorError,
  31     float_or_none,
  32     get_element_by_attribute,
  33     get_element_by_id,
  34     int_or_none,
  35     orderedSet,
  36     parse_duration,
  37     remove_start,
  38     smuggle_url,
  39     str_to_int,
  40     unescapeHTML,
  41     unified_strdate,
  42     unsmuggle_url,
  43     uppercase_escape,
  44     ISO3166Utils,
  45 )
  46
  47
  48 class YoutubeBaseInfoExtractor(InfoExtractor):
  49     """Provide base functions for Youtube extractors"""
  50     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  51     _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
  52     _NETRC_MACHINE = 'youtube'
  53     # If True it will raise an error if no login info is provided
  54     _LOGIN_REQUIRED = False
  55
  56     def _set_language(self):
  57         self._set_cookie(
  58             '.youtube.com', 'PREF', 'f1=50000000&hl=en',
  59             # YouTube sets the expire time to about two months
  60             expire_time=time.time() + 2 * 30 * 24 * 3600)
  61
  62     def _ids_to_results(self, ids):
  63         return [
  64             self.url_result(vid_id, 'Youtube', video_id=vid_id)
  65             for vid_id in ids]
  66
  67     def _login(self):
  68         """
  69         Attempt to log in to YouTube.
  70         True is returned if successful or skipped.
  71         False is returned if login failed.
  72
  73         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  74         """
  75         (username, password) = self._get_login_info()
  76         # No authentication to be performed
  77         if username is None:
  78             if self._LOGIN_REQUIRED:
  79                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  80             return True
  81
  82         login_page = self._download_webpage(
  83             self._LOGIN_URL, None,
  84             note='Downloading login page',
  85             errnote='unable to fetch login page', fatal=False)
  86         if login_page is False:
  87             return
  88
  89         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  90                                   login_page, 'Login GALX parameter')
  91
  92         # Log in
  93         login_form_strs = {
  94             'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  95             'Email': username,
  96             'GALX': galx,
  97             'Passwd': password,
  98
  99             'PersistentCookie': 'yes',
 100             '_utf8': '霱',
 101             'bgresponse': 'js_disabled',
 102             'checkConnection': '',
 103             'checkedDomains': 'youtube',
 104             'dnConn': '',
 105             'pstMsg': '0',
 106             'rmShown': '1',
 107             'secTok': '',
 108             'signIn': 'Sign in',
 109             'timeStmp': '',
 110             'service': 'youtube',
 111             'uilel': '3',
 112             'hl': 'en_US',
 113         }
 114
 115         login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
 116
 117         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 118         login_results = self._download_webpage(
 119             req, None,
 120             note='Logging in', errnote='unable to log in', fatal=False)
 121         if login_results is False:
 122             return False
 123
 124         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 125             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 126
 127         # Two-Factor
 128         # TODO add SMS and phone call support - these require making a request and then prompting the user
 129
 130         if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
 131             tfa_code = self._get_tfa_info('2-step verification code')
 132
 133             if not tfa_code:
 134                 self._downloader.report_warning(
 135                     'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
 136                     '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 137                 return False
 138
 139             tfa_code = remove_start(tfa_code, 'G-')
 140
 141             tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
 142
 143             tfa_form_strs.update({
 144                 'Pin': tfa_code,
 145                 'TrustDevice': 'on',
 146             })
 147
 148             tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
 149
 150             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 151             tfa_results = self._download_webpage(
 152                 tfa_req, None,
 153                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 154
 155             if tfa_results is False:
 156                 return False
 157
 158             if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
 159                 self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
 160                 return False
 161             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 162                 self._downloader.report_warning('unable to log in - did the page structure change?')
 163                 return False
 164             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 165                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 166                 return False
 167
 168         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 169             self._downloader.report_warning('unable to log in: bad username or password')
 170             return False
 171         return True
 172
 173     def _real_initialize(self):
 174         if self._downloader is None:
 175             return
 176         self._set_language()
 177         if not self._login():
 178             return
 179
 180
 181 class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
 182     # Extract the video ids from the playlist pages
 183     def _entries(self, page, playlist_id):
 184         more_widget_html = content_html = page
 185         for page_num in itertools.count(1):
 186             for video_id, video_title in self.extract_videos_from_page(content_html):
 187                 yield self.url_result(
 188                     video_id, 'Youtube', video_id=video_id,
 189                     video_title=video_title)
 190
 191             mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
 192             if not mobj:
 193                 break
 194
 195             more = self._download_json(
 196                 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
 197                 'Downloading page #%s' % page_num,
 198                 transform_source=uppercase_escape)
 199             content_html = more['content_html']
 200             if not content_html.strip():
 201                 # Some webpages show a "Load more" button but they don't
 202                 # have more videos
 203                 break
 204             more_widget_html = more['load_more_widget_html']
 205
 206     def extract_videos_from_page(self, page):
 207         ids_in_page = []
 208         titles_in_page = []
 209         for mobj in re.finditer(self._VIDEO_RE, page):
 210             # The link with index 0 is not the first video of the playlist (not sure if still actual)
 211             if 'index' in mobj.groupdict() and mobj.group('id') == '0':
 212                 continue
 213             video_id = mobj.group('id')
 214             video_title = unescapeHTML(mobj.group('title'))
 215             if video_title:
 216                 video_title = video_title.strip()
 217             try:
 218                 idx = ids_in_page.index(video_id)
 219                 if video_title and not titles_in_page[idx]:
 220                     titles_in_page[idx] = video_title
 221             except ValueError:
 222                 ids_in_page.append(video_id)
 223                 titles_in_page.append(video_title)
 224         return zip(ids_in_page, titles_in_page)
 225
 226
 227 class YoutubePlaylistsBaseInfoExtractor(InfoExtractor):
 228     def _real_extract(self, url):
 229         playlist_id = self._match_id(url)
 230         webpage = self._download_webpage(url, playlist_id)
 231         entries = [
 232             self.url_result(compat_urlparse.urljoin(url, playlist), 'YoutubePlaylist')
 233             for playlist in re.findall(r'href="(/playlist\?list=.+?)"', webpage)]
 234         title = self._og_search_title(webpage, fatal=False)
 235         return self.playlist_result(entries, playlist_id, title)
 236
 237
 238 class YoutubeIE(YoutubeBaseInfoExtractor):
 239     IE_DESC = 'YouTube.com'
 240     _VALID_URL = r"""(?x)^
 241                      (
 242                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 243                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 244                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 245                             (?:www\.)?pwnyoutube\.com/|
 246                             (?:www\.)?yourepeat\.com/|
 247                             tube\.majestyc\.net/|
 248                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 249                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 250                          (?:                                                  # the various things that can precede the ID:
 251                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 252                              |(?:                                             # or the v= param in all its forms
 253                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 254                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 255                                  (?:.*?&)??                                   # any other preceding param (like /?s=tuff&v=xxxx)
 256                                  v=
 257                              )
 258                          ))
 259                          |(?:
 260                             youtu\.be|                                        # just youtu.be/xxxx
 261                             vid\.plus                                         # or vid.plus/xxxx
 262                          )/
 263                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 264                          )
 265                      )?                                                       # all until now is optional -> you can pass the naked ID
 266                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 267                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 268                      (?(1).+)?                                                # if we found the ID, everything can follow
 269                      $"""
 270     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 271     _formats = {
 272         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 273         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 274         '13': {'ext': '3gp'},
 275         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 276         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 277         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 278         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 279         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 280         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 281         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 282         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 283         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 284         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 285         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 286         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 287         '59': {'ext': 'mp4', 'width': 854, 'height': 480},
 288         '78': {'ext': 'mp4', 'width': 854, 'height': 480},
 289
 290
 291         # 3d videos
 292         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 293         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 294         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 295         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 296         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 297         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 298         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 299
 300         # Apple HTTP Live Streaming
 301         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 302         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 303         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 304         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 305         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 306         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 307         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 308
 309         # DASH mp4 video
 310         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 311         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 312         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 313         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 314         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 315         '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
 316         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 317         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 318         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 319         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 320         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 321
 322         # Dash mp4 audio
 323         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
 324         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
 325         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
 326
 327         # Dash webm
 328         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 329         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 330         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 331         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 332         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 333         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
 334         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
 335         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 336         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 337         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 338         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 339         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 340         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 341         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 342         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 343         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 344         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
 345         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
 346         '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
 347         '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
 348         '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
 349
 350         # Dash webm audio
 351         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 352         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 353
 354         # Dash webm audio with opus inside
 355         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 356         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 357         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 358
 359         # RTMP (unnamed)
 360         '_rtmp': {'protocol': 'rtmp'},
 361     }
 362
 363     IE_NAME = 'youtube'
 364     _TESTS = [
 365         {
 366             'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
 367             'info_dict': {
 368                 'id': 'BaW_jenozKc',
 369                 'ext': 'mp4',
 370                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 371                 'uploader': 'Philipp Hagemeister',
 372                 'uploader_id': 'phihag',
 373                 'upload_date': '20121002',
 374                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 375                 'categories': ['Science & Technology'],
 376                 'tags': ['youtube-dl'],
 377                 'like_count': int,
 378                 'dislike_count': int,
 379                 'start_time': 1,
 380                 'end_time': 9,
 381             }
 382         },
 383         {
 384             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 385             'note': 'Test generic use_cipher_signature video (#897)',
 386             'info_dict': {
 387                 'id': 'UxxajLWwzqY',
 388                 'ext': 'mp4',
 389                 'upload_date': '20120506',
 390                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 391                 'description': 'md5:782e8651347686cba06e58f71ab51773',
 392                 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
 393                          'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
 394                          'iconic ep', 'iconic', 'love', 'it'],
 395                 'uploader': 'Icona Pop',
 396                 'uploader_id': 'IconaPop',
 397             }
 398         },
 399         {
 400             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 401             'note': 'Test VEVO video with age protection (#956)',
 402             'info_dict': {
 403                 'id': '07FYdnEawAQ',
 404                 'ext': 'mp4',
 405                 'upload_date': '20130703',
 406                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 407                 'description': 'md5:64249768eec3bc4276236606ea996373',
 408                 'uploader': 'justintimberlakeVEVO',
 409                 'uploader_id': 'justintimberlakeVEVO',
 410                 'age_limit': 18,
 411             }
 412         },
 413         {
 414             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 415             'note': 'Embed-only video (#1746)',
 416             'info_dict': {
 417                 'id': 'yZIXLfi8CZQ',
 418                 'ext': 'mp4',
 419                 'upload_date': '20120608',
 420                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 421                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 422                 'uploader': 'SET India',
 423                 'uploader_id': 'setindia'
 424             }
 425         },
 426         {
 427             'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
 428             'note': 'Use the first video ID in the URL',
 429             'info_dict': {
 430                 'id': 'BaW_jenozKc',
 431                 'ext': 'mp4',
 432                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 433                 'uploader': 'Philipp Hagemeister',
 434                 'uploader_id': 'phihag',
 435                 'upload_date': '20121002',
 436                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 437                 'categories': ['Science & Technology'],
 438                 'tags': ['youtube-dl'],
 439                 'like_count': int,
 440                 'dislike_count': int,
 441             },
 442             'params': {
 443                 'skip_download': True,
 444             },
 445         },
 446         {
 447             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 448             'note': '256k DASH audio (format 141) via DASH manifest',
 449             'info_dict': {
 450                 'id': 'a9LDPn-MO4I',
 451                 'ext': 'm4a',
 452                 'upload_date': '20121002',
 453                 'uploader_id': '8KVIDEO',
 454                 'description': '',
 455                 'uploader': '8KVIDEO',
 456                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 457             },
 458             'params': {
 459                 'youtube_include_dash_manifest': True,
 460                 'format': '141',
 461             },
 462         },
 463         # DASH manifest with encrypted signature
 464         {
 465             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 466             'info_dict': {
 467                 'id': 'IB3lcPjvWLA',
 468                 'ext': 'm4a',
 469                 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
 470                 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
 471                 'uploader': 'AfrojackVEVO',
 472                 'uploader_id': 'AfrojackVEVO',
 473                 'upload_date': '20131011',
 474             },
 475             'params': {
 476                 'youtube_include_dash_manifest': True,
 477                 'format': '141',
 478             },
 479         },
 480         # JS player signature function name containing $
 481         {
 482             'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
 483             'info_dict': {
 484                 'id': 'nfWlot6h_JM',
 485                 'ext': 'm4a',
 486                 'title': 'Taylor Swift - Shake It Off',
 487                 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
 488                 'uploader': 'TaylorSwiftVEVO',
 489                 'uploader_id': 'TaylorSwiftVEVO',
 490                 'upload_date': '20140818',
 491             },
 492             'params': {
 493                 'youtube_include_dash_manifest': True,
 494                 'format': '141',
 495             },
 496         },
 497         # Controversy video
 498         {
 499             'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
 500             'info_dict': {
 501                 'id': 'T4XJQO3qol8',
 502                 'ext': 'mp4',
 503                 'upload_date': '20100909',
 504                 'uploader': 'The Amazing Atheist',
 505                 'uploader_id': 'TheAmazingAtheist',
 506                 'title': 'Burning Everyone\'s Koran',
 507                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
 508             }
 509         },
 510         # Normal age-gate video (No vevo, embed allowed)
 511         {
 512             'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
 513             'info_dict': {
 514                 'id': 'HtVdAasjOgU',
 515                 'ext': 'mp4',
 516                 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
 517                 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
 518                 'uploader': 'The Witcher',
 519                 'uploader_id': 'WitcherGame',
 520                 'upload_date': '20140605',
 521                 'age_limit': 18,
 522             },
 523         },
 524         # Age-gate video with encrypted signature
 525         {
 526             'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
 527             'info_dict': {
 528                 'id': '6kLq3WMV1nU',
 529                 'ext': 'mp4',
 530                 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
 531                 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
 532                 'uploader': 'LloydVEVO',
 533                 'uploader_id': 'LloydVEVO',
 534                 'upload_date': '20110629',
 535                 'age_limit': 18,
 536             },
 537         },
 538         # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
 539         {
 540             'url': '__2ABJjxzNo',
 541             'info_dict': {
 542                 'id': '__2ABJjxzNo',
 543                 'ext': 'mp4',
 544                 'upload_date': '20100430',
 545                 'uploader_id': 'deadmau5',
 546                 'description': 'md5:12c56784b8032162bb936a5f76d55360',
 547                 'uploader': 'deadmau5',
 548                 'title': 'Deadmau5 - Some Chords (HD)',
 549             },
 550             'expected_warnings': [
 551                 'DASH manifest missing',
 552             ]
 553         },
 554         # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
 555         {
 556             'url': 'lqQg6PlCWgI',
 557             'info_dict': {
 558                 'id': 'lqQg6PlCWgI',
 559                 'ext': 'mp4',
 560                 'upload_date': '20120724',
 561                 'uploader_id': 'olympic',
 562                 'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
 563                 'uploader': 'Olympics',
 564                 'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
 565             },
 566             'params': {
 567                 'skip_download': 'requires avconv',
 568             }
 569         },
 570         # Non-square pixels
 571         {
 572             'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
 573             'info_dict': {
 574                 'id': '_b-2C3KPAM0',
 575                 'ext': 'mp4',
 576                 'stretched_ratio': 16 / 9.,
 577                 'upload_date': '20110310',
 578                 'uploader_id': 'AllenMeow',
 579                 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
 580                 'uploader': '孫艾倫',
 581                 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
 582             },
 583         },
 584         # url_encoded_fmt_stream_map is empty string
 585         {
 586             'url': 'qEJwOuvDf7I',
 587             'info_dict': {
 588                 'id': 'qEJwOuvDf7I',
 589                 'ext': 'webm',
 590                 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
 591                 'description': '',
 592                 'upload_date': '20150404',
 593                 'uploader_id': 'spbelect',
 594                 'uploader': 'Наблюдатели Петербурга',
 595             },
 596             'params': {
 597                 'skip_download': 'requires avconv',
 598             }
 599         },
 600         # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
 601         {
 602             'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
 603             'info_dict': {
 604                 'id': 'FIl7x6_3R5Y',
 605                 'ext': 'mp4',
 606                 'title': 'md5:7b81415841e02ecd4313668cde88737a',
 607                 'description': 'md5:116377fd2963b81ec4ce64b542173306',
 608                 'upload_date': '20150625',
 609                 'uploader_id': 'dorappi2000',
 610                 'uploader': 'dorappi2000',
 611                 'formats': 'mincount:33',
 612             },
 613         },
 614         # DASH manifest with segment_list
 615         {
 616             'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
 617             'md5': '8ce563a1d667b599d21064e982ab9e31',
 618             'info_dict': {
 619                 'id': 'CsmdDsKjzN8',
 620                 'ext': 'mp4',
 621                 'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
 622                 'uploader': 'Airtek',
 623                 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
 624                 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
 625                 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
 626             },
 627             'params': {
 628                 'youtube_include_dash_manifest': True,
 629                 'format': '135',  # bestvideo
 630             }
 631         },
 632         {
 633             # Multifeed videos (multiple cameras), URL is for Main Camera
 634             'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
 635             'info_dict': {
 636                 'id': 'jqWvoWXjCVs',
 637                 'title': 'teamPGP: Rocket League Noob Stream',
 638                 'description': 'md5:dc7872fb300e143831327f1bae3af010',
 639             },
 640             'playlist': [{
 641                 'info_dict': {
 642                     'id': 'jqWvoWXjCVs',
 643                     'ext': 'mp4',
 644                     'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
 645                     'description': 'md5:dc7872fb300e143831327f1bae3af010',
 646                     'upload_date': '20150721',
 647                     'uploader': 'Beer Games Beer',
 648                     'uploader_id': 'beergamesbeer',
 649                 },
 650             }, {
 651                 'info_dict': {
 652                     'id': '6h8e8xoXJzg',
 653                     'ext': 'mp4',
 654                     'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
 655                     'description': 'md5:dc7872fb300e143831327f1bae3af010',
 656                     'upload_date': '20150721',
 657                     'uploader': 'Beer Games Beer',
 658                     'uploader_id': 'beergamesbeer',
 659                 },
 660             }, {
 661                 'info_dict': {
 662                     'id': 'PUOgX5z9xZw',
 663                     'ext': 'mp4',
 664                     'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
 665                     'description': 'md5:dc7872fb300e143831327f1bae3af010',
 666                     'upload_date': '20150721',
 667                     'uploader': 'Beer Games Beer',
 668                     'uploader_id': 'beergamesbeer',
 669                 },
 670             }, {
 671                 'info_dict': {
 672                     'id': 'teuwxikvS5k',
 673                     'ext': 'mp4',
 674                     'title': 'teamPGP: Rocket League Noob Stream (zim)',
 675                     'description': 'md5:dc7872fb300e143831327f1bae3af010',
 676                     'upload_date': '20150721',
 677                     'uploader': 'Beer Games Beer',
 678                     'uploader_id': 'beergamesbeer',
 679                 },
 680             }],
 681             'params': {
 682                 'skip_download': True,
 683             },
 684         },
 685         {
 686             'url': 'http://vid.plus/FlRa-iH7PGw',
 687             'only_matching': True,
 688         }
 689     ]
 690
 691     def __init__(self, *args, **kwargs):
 692         super(YoutubeIE, self).__init__(*args, **kwargs)
 693         self._player_cache = {}
 694
 695     def report_video_info_webpage_download(self, video_id):
 696         """Report attempt to download video info webpage."""
 697         self.to_screen('%s: Downloading video info webpage' % video_id)
 698
 699     def report_information_extraction(self, video_id):
 700         """Report attempt to extract video information."""
 701         self.to_screen('%s: Extracting video information' % video_id)
 702
 703     def report_unavailable_format(self, video_id, format):
 704         """Report extracted video URL."""
 705         self.to_screen('%s: Format %s not available' % (video_id, format))
 706
 707     def report_rtmp_download(self):
 708         """Indicate the download will use the RTMP protocol."""
 709         self.to_screen('RTMP download detected')
 710
 711     def _signature_cache_id(self, example_sig):
 712         """ Return a string representation of a signature """
 713         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 714
 715     def _extract_signature_function(self, video_id, player_url, example_sig):
 716         id_m = re.match(
 717             r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
 718             player_url)
 719         if not id_m:
 720             raise ExtractorError('Cannot identify player %r' % player_url)
 721         player_type = id_m.group('ext')
 722         player_id = id_m.group('id')
 723
 724         # Read from filesystem cache
 725         func_id = '%s_%s_%s' % (
 726             player_type, player_id, self._signature_cache_id(example_sig))
 727         assert os.path.basename(func_id) == func_id
 728
 729         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 730         if cache_spec is not None:
 731             return lambda s: ''.join(s[i] for i in cache_spec)
 732
 733         download_note = (
 734             'Downloading player %s' % player_url
 735             if self._downloader.params.get('verbose') else
 736             'Downloading %s player %s' % (player_type, player_id)
 737         )
 738         if player_type == 'js':
 739             code = self._download_webpage(
 740                 player_url, video_id,
 741                 note=download_note,
 742                 errnote='Download of %s failed' % player_url)
 743             res = self._parse_sig_js(code)
 744         elif player_type == 'swf':
 745             urlh = self._request_webpage(
 746                 player_url, video_id,
 747                 note=download_note,
 748                 errnote='Download of %s failed' % player_url)
 749             code = urlh.read()
 750             res = self._parse_sig_swf(code)
 751         else:
 752             assert False, 'Invalid player type %r' % player_type
 753
 754         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 755         cache_res = res(test_string)
 756         cache_spec = [ord(c) for c in cache_res]
 757
 758         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 759         return res
 760
 761     def _print_sig_code(self, func, example_sig):
 762         def gen_sig_code(idxs):
 763             def _genslice(start, end, step):
 764                 starts = '' if start == 0 else str(start)
 765                 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
 766                 steps = '' if step == 1 else (':%d' % step)
 767                 return 's[%s%s%s]' % (starts, ends, steps)
 768
 769             step = None
 770             # Quelch pyflakes warnings - start will be set when step is set
 771             start = '(Never used)'
 772             for i, prev in zip(idxs[1:], idxs[:-1]):
 773                 if step is not None:
 774                     if i - prev == step:
 775                         continue
 776                     yield _genslice(start, prev, step)
 777                     step = None
 778                     continue
 779                 if i - prev in [-1, 1]:
 780                     step = i - prev
 781                     start = prev
 782                     continue
 783                 else:
 784                     yield 's[%d]' % prev
 785             if step is None:
 786                 yield 's[%d]' % i
 787             else:
 788                 yield _genslice(start, i, step)
 789
 790         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 791         cache_res = func(test_string)
 792         cache_spec = [ord(c) for c in cache_res]
 793         expr_code = ' + '.join(gen_sig_code(cache_spec))
 794         signature_id_tuple = '(%s)' % (
 795             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 796         code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 797                 '    return %s\n') % (signature_id_tuple, expr_code)
 798         self.to_screen('Extracted signature function:\n' + code)
 799
 800     def _parse_sig_js(self, jscode):
 801         funcname = self._search_regex(
 802             r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
 803             'Initial JS player signature function name')
 804
 805         jsi = JSInterpreter(jscode)
 806         initial_function = jsi.extract_function(funcname)
 807         return lambda s: initial_function([s])
 808
 809     def _parse_sig_swf(self, file_contents):
 810         swfi = SWFInterpreter(file_contents)
 811         TARGET_CLASSNAME = 'SignatureDecipher'
 812         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 813         initial_function = swfi.extract_function(searched_class, 'decipher')
 814         return lambda s: initial_function([s])
 815
 816     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 817         """Turn the encrypted s field into a working signature"""
 818
 819         if player_url is None:
 820             raise ExtractorError('Cannot decrypt signature without player_url')
 821
 822         if player_url.startswith('//'):
 823             player_url = 'https:' + player_url
 824         try:
 825             player_id = (player_url, self._signature_cache_id(s))
 826             if player_id not in self._player_cache:
 827                 func = self._extract_signature_function(
 828                     video_id, player_url, s
 829                 )
 830                 self._player_cache[player_id] = func
 831             func = self._player_cache[player_id]
 832             if self._downloader.params.get('youtube_print_sig_code'):
 833                 self._print_sig_code(func, s)
 834             return func(s)
 835         except Exception as e:
 836             tb = traceback.format_exc()
 837             raise ExtractorError(
 838                 'Signature extraction failed: ' + tb, cause=e)
 839
 840     def _get_subtitles(self, video_id, webpage):
 841         try:
 842             subs_doc = self._download_xml(
 843                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 844                 video_id, note=False)
 845         except ExtractorError as err:
 846             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 847             return {}
 848
 849         sub_lang_list = {}
 850         for track in subs_doc.findall('track'):
 851             lang = track.attrib['lang_code']
 852             if lang in sub_lang_list:
 853                 continue
 854             sub_formats = []
 855             for ext in ['sbv', 'vtt', 'srt']:
 856                 params = compat_urllib_parse.urlencode({
 857                     'lang': lang,
 858                     'v': video_id,
 859                     'fmt': ext,
 860                     'name': track.attrib['name'].encode('utf-8'),
 861                 })
 862                 sub_formats.append({
 863                     'url': 'https://www.youtube.com/api/timedtext?' + params,
 864                     'ext': ext,
 865                 })
 866             sub_lang_list[lang] = sub_formats
 867         if not sub_lang_list:
 868             self._downloader.report_warning('video doesn\'t have subtitles')
 869             return {}
 870         return sub_lang_list
 871
 872     def _get_automatic_captions(self, video_id, webpage):
 873         """We need the webpage for getting the captions url, pass it as an
 874            argument to speed up the process."""
 875         self.to_screen('%s: Looking for automatic captions' % video_id)
 876         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 877         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 878         if mobj is None:
 879             self._downloader.report_warning(err_msg)
 880             return {}
 881         player_config = json.loads(mobj.group(1))
 882         try:
 883             args = player_config['args']
 884             caption_url = args['ttsurl']
 885             timestamp = args['timestamp']
 886             # We get the available subtitles
 887             list_params = compat_urllib_parse.urlencode({
 888                 'type': 'list',
 889                 'tlangs': 1,
 890                 'asrs': 1,
 891             })
 892             list_url = caption_url + '&' + list_params
 893             caption_list = self._download_xml(list_url, video_id)
 894             original_lang_node = caption_list.find('track')
 895             if original_lang_node is None:
 896                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 897                 return {}
 898             original_lang = original_lang_node.attrib['lang_code']
 899             caption_kind = original_lang_node.attrib.get('kind', '')
 900
 901             sub_lang_list = {}
 902             for lang_node in caption_list.findall('target'):
 903                 sub_lang = lang_node.attrib['lang_code']
 904                 sub_formats = []
 905                 for ext in ['sbv', 'vtt', 'srt']:
 906                     params = compat_urllib_parse.urlencode({
 907                         'lang': original_lang,
 908                         'tlang': sub_lang,
 909                         'fmt': ext,
 910                         'ts': timestamp,
 911                         'kind': caption_kind,
 912                     })
 913                     sub_formats.append({
 914                         'url': caption_url + '&' + params,
 915                         'ext': ext,
 916                     })
 917                 sub_lang_list[sub_lang] = sub_formats
 918             return sub_lang_list
 919         # An extractor error can be raise by the download process if there are
 920         # no automatic captions but there are subtitles
 921         except (KeyError, ExtractorError):
 922             self._downloader.report_warning(err_msg)
 923             return {}
 924
 925     @classmethod
 926     def extract_id(cls, url):
 927         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 928         if mobj is None:
 929             raise ExtractorError('Invalid URL: %s' % url)
 930         video_id = mobj.group(2)
 931         return video_id
 932
 933     def _extract_from_m3u8(self, manifest_url, video_id):
 934         url_map = {}
 935
 936         def _get_urls(_manifest):
 937             lines = _manifest.split('\n')
 938             urls = filter(lambda l: l and not l.startswith('#'),
 939                           lines)
 940             return urls
 941         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 942         formats_urls = _get_urls(manifest)
 943         for format_url in formats_urls:
 944             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 945             url_map[itag] = format_url
 946         return url_map
 947
 948     def _extract_annotations(self, video_id):
 949         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 950         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 951
 952     def _parse_dash_manifest(
 953             self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
 954         def decrypt_sig(mobj):
 955             s = mobj.group(1)
 956             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 957             return '/signature/%s' % dec_s
 958         dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
 959         dash_doc = self._download_xml(
 960             dash_manifest_url, video_id,
 961             note='Downloading DASH manifest',
 962             errnote='Could not download DASH manifest',
 963             fatal=fatal)
 964
 965         if dash_doc is False:
 966             return []
 967
 968         formats = []
 969         for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
 970             mime_type = a.attrib.get('mimeType')
 971             for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 972                 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 973                 if url_el is None:
 974                     continue
 975                 if mime_type == 'text/vtt':
 976                     # TODO implement WebVTT downloading
 977                     pass
 978                 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
 979                     segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
 980                     format_id = r.attrib['id']
 981                     video_url = url_el.text
 982                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 983                     f = {
 984                         'format_id': format_id,
 985                         'url': video_url,
 986                         'width': int_or_none(r.attrib.get('width')),
 987                         'height': int_or_none(r.attrib.get('height')),
 988                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 989                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 990                         'filesize': filesize,
 991                         'fps': int_or_none(r.attrib.get('frameRate')),
 992                     }
 993                     if segment_list is not None:
 994                         f.update({
 995                             'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
 996                             'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
 997                             'protocol': 'http_dash_segments',
 998                         })
 999                     try:
1000                         existing_format = next(
1001                             fo for fo in formats
1002                             if fo['format_id'] == format_id)
1003                     except StopIteration:
1004                         full_info = self._formats.get(format_id, {}).copy()
1005                         full_info.update(f)
1006                         codecs = r.attrib.get('codecs')
1007                         if codecs:
1008                             if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
1009                                 full_info['vcodec'] = codecs
1010                             elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
1011                                 full_info['acodec'] = codecs
1012                         formats.append(full_info)
1013                     else:
1014                         existing_format.update(f)
1015                 else:
1016                     self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1017         return formats
1018
1019     def _real_extract(self, url):
1020         url, smuggled_data = unsmuggle_url(url, {})
1021
1022         proto = (
1023             'http' if self._downloader.params.get('prefer_insecure', False)
1024             else 'https')
1025
1026         start_time = None
1027         end_time = None
1028         parsed_url = compat_urllib_parse_urlparse(url)
1029         for component in [parsed_url.fragment, parsed_url.query]:
1030             query = compat_parse_qs(component)
1031             if start_time is None and 't' in query:
1032                 start_time = parse_duration(query['t'][0])
1033             if start_time is None and 'start' in query:
1034                 start_time = parse_duration(query['start'][0])
1035             if end_time is None and 'end' in query:
1036                 end_time = parse_duration(query['end'][0])
1037
1038         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1039         mobj = re.search(self._NEXT_URL_RE, url)
1040         if mobj:
1041             url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1042         video_id = self.extract_id(url)
1043
1044         # Get video webpage
1045         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1046         video_webpage = self._download_webpage(url, video_id)
1047
1048         # Attempt to extract SWF player URL
1049         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1050         if mobj is not None:
1051             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1052         else:
1053             player_url = None
1054
1055         dash_mpds = []
1056
1057         def add_dash_mpd(video_info):
1058             dash_mpd = video_info.get('dashmpd')
1059             if dash_mpd and dash_mpd[0] not in dash_mpds:
1060                 dash_mpds.append(dash_mpd[0])
1061
1062         # Get video info
1063         embed_webpage = None
1064         is_live = None
1065         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1066             age_gate = True
1067             # We simulate the access to the video from www.youtube.com/v/{video_id}
1068             # this can be viewed without login into Youtube
1069             url = proto + '://www.youtube.com/embed/%s' % video_id
1070             embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1071             data = compat_urllib_parse.urlencode({
1072                 'video_id': video_id,
1073                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1074                 'sts': self._search_regex(
1075                     r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1076             })
1077             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1078             video_info_webpage = self._download_webpage(
1079                 video_info_url, video_id,
1080                 note='Refetching age-gated info webpage',
1081                 errnote='unable to download video info webpage')
1082             video_info = compat_parse_qs(video_info_webpage)
1083             add_dash_mpd(video_info)
1084         else:
1085             age_gate = False
1086             video_info = None
1087             # Try looking directly into the video webpage
1088             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1089             if mobj:
1090                 json_code = uppercase_escape(mobj.group(1))
1091                 ytplayer_config = json.loads(json_code)
1092                 args = ytplayer_config['args']
1093                 if args.get('url_encoded_fmt_stream_map'):
1094                     # Convert to the same format returned by compat_parse_qs
1095                     video_info = dict((k, [v]) for k, v in args.items())
1096                     add_dash_mpd(video_info)
1097                 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1098                     is_live = True
1099             if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1100                 # We also try looking in get_video_info since it may contain different dashmpd
1101                 # URL that points to a DASH manifest with possibly different itag set (some itags
1102                 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1103                 # manifest pointed by get_video_info's dashmpd).
1104                 # The general idea is to take a union of itags of both DASH manifests (for example
1105                 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1106                 self.report_video_info_webpage_download(video_id)
1107                 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1108                     video_info_url = (
1109                         '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1110                         % (proto, video_id, el_type))
1111                     video_info_webpage = self._download_webpage(
1112                         video_info_url,
1113                         video_id, note=False,
1114                         errnote='unable to download video info webpage')
1115                     get_video_info = compat_parse_qs(video_info_webpage)
1116                     if get_video_info.get('use_cipher_signature') != ['True']:
1117                         add_dash_mpd(get_video_info)
1118                     if not video_info:
1119                         video_info = get_video_info
1120                     if 'token' in get_video_info:
1121                         # Different get_video_info requests may report different results, e.g.
1122                         # some may report video unavailability, but some may serve it without
1123                         # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1124                         # the original webpage as well as el=info and el=embedded get_video_info
1125                         # requests report video unavailability due to geo restriction while
1126                         # el=detailpage succeeds and returns valid data). This is probably
1127                         # due to YouTube measures against IP ranges of hosting providers.
1128                         # Working around by preferring the first succeeded video_info containing
1129                         # the token if no such video_info yet was found.
1130                         if 'token' not in video_info:
1131                             video_info = get_video_info
1132                         break
1133         if 'token' not in video_info:
1134             if 'reason' in video_info:
1135                 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1136                     regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1137                     if regions_allowed:
1138                         raise ExtractorError('YouTube said: This video is available in %s only' % (
1139                             ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1140                             expected=True)
1141                 raise ExtractorError(
1142                     'YouTube said: %s' % video_info['reason'][0],
1143                     expected=True, video_id=video_id)
1144             else:
1145                 raise ExtractorError(
1146                     '"token" parameter not in video info for unknown reason',
1147                     video_id=video_id)
1148
1149         # title
1150         if 'title' in video_info:
1151             video_title = video_info['title'][0]
1152         else:
1153             self._downloader.report_warning('Unable to extract video title')
1154             video_title = '_'
1155
1156         # description
1157         video_description = get_element_by_id("eow-description", video_webpage)
1158         if video_description:
1159             video_description = re.sub(r'''(?x)
1160                 <a\s+
1161                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1162                     title="([^"]+)"\s+
1163                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1164                     class="yt-uix-redirect-link"\s*>
1165                 [^<]+
1166                 </a>
1167             ''', r'\1', video_description)
1168             video_description = clean_html(video_description)
1169         else:
1170             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1171             if fd_mobj:
1172                 video_description = unescapeHTML(fd_mobj.group(1))
1173             else:
1174                 video_description = ''
1175
1176         if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1177             if not self._downloader.params.get('noplaylist'):
1178                 entries = []
1179                 feed_ids = []
1180                 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1181                 for feed in multifeed_metadata_list.split(','):
1182                     feed_data = compat_parse_qs(feed)
1183                     entries.append({
1184                         '_type': 'url_transparent',
1185                         'ie_key': 'Youtube',
1186                         'url': smuggle_url(
1187                             '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1188                             {'force_singlefeed': True}),
1189                         'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1190                     })
1191                     feed_ids.append(feed_data['id'][0])
1192                 self.to_screen(
1193                     'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1194                     % (', '.join(feed_ids), video_id))
1195                 return self.playlist_result(entries, video_id, video_title, video_description)
1196             self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1197
1198         if 'view_count' in video_info:
1199             view_count = int(video_info['view_count'][0])
1200         else:
1201             view_count = None
1202
1203         # Check for "rental" videos
1204         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1205             raise ExtractorError('"rental" videos not supported')
1206
1207         # Start extracting information
1208         self.report_information_extraction(video_id)
1209
1210         # uploader
1211         if 'author' not in video_info:
1212             raise ExtractorError('Unable to extract uploader name')
1213         video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1214
1215         # uploader_id
1216         video_uploader_id = None
1217         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1218         if mobj is not None:
1219             video_uploader_id = mobj.group(1)
1220         else:
1221             self._downloader.report_warning('unable to extract uploader nickname')
1222
1223         # thumbnail image
1224         # We try first to get a high quality image:
1225         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1226                             video_webpage, re.DOTALL)
1227         if m_thumb is not None:
1228             video_thumbnail = m_thumb.group(1)
1229         elif 'thumbnail_url' not in video_info:
1230             self._downloader.report_warning('unable to extract video thumbnail')
1231             video_thumbnail = None
1232         else:   # don't panic if we can't find it
1233             video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1234
1235         # upload date
1236         upload_date = self._html_search_meta(
1237             'datePublished', video_webpage, 'upload date', default=None)
1238         if not upload_date:
1239             upload_date = self._search_regex(
1240                 [r'(?s)id="eow-date.*?>(.*?)</span>',
1241                  r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1242                 video_webpage, 'upload date', default=None)
1243             if upload_date:
1244                 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1245         upload_date = unified_strdate(upload_date)
1246
1247         m_cat_container = self._search_regex(
1248             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1249             video_webpage, 'categories', default=None)
1250         if m_cat_container:
1251             category = self._html_search_regex(
1252                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1253                 default=None)
1254             video_categories = None if category is None else [category]
1255         else:
1256             video_categories = None
1257
1258         video_tags = [
1259             unescapeHTML(m.group('content'))
1260             for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1261
1262         def _extract_count(count_name):
1263             return str_to_int(self._search_regex(
1264                 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1265                 % re.escape(count_name),
1266                 video_webpage, count_name, default=None))
1267
1268         like_count = _extract_count('like')
1269         dislike_count = _extract_count('dislike')
1270
1271         # subtitles
1272         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1273         automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1274
1275         if 'length_seconds' not in video_info:
1276             self._downloader.report_warning('unable to extract video duration')
1277             video_duration = None
1278         else:
1279             video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1280
1281         # annotations
1282         video_annotations = None
1283         if self._downloader.params.get('writeannotations', False):
1284             video_annotations = self._extract_annotations(video_id)
1285
1286         def _map_to_format_list(urlmap):
1287             formats = []
1288             for itag, video_real_url in urlmap.items():
1289                 dct = {
1290                     'format_id': itag,
1291                     'url': video_real_url,
1292                     'player_url': player_url,
1293                 }
1294                 if itag in self._formats:
1295                     dct.update(self._formats[itag])
1296                 formats.append(dct)
1297             return formats
1298
1299         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1300             self.report_rtmp_download()
1301             formats = [{
1302                 'format_id': '_rtmp',
1303                 'protocol': 'rtmp',
1304                 'url': video_info['conn'][0],
1305                 'player_url': player_url,
1306             }]
1307         elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1308             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1309             if 'rtmpe%3Dyes' in encoded_url_map:
1310                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1311             formats = []
1312             for url_data_str in encoded_url_map.split(','):
1313                 url_data = compat_parse_qs(url_data_str)
1314                 if 'itag' not in url_data or 'url' not in url_data:
1315                     continue
1316                 format_id = url_data['itag'][0]
1317                 url = url_data['url'][0]
1318
1319                 if 'sig' in url_data:
1320                     url += '&signature=' + url_data['sig'][0]
1321                 elif 's' in url_data:
1322                     encrypted_sig = url_data['s'][0]
1323                     ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1324
1325                     jsplayer_url_json = self._search_regex(
1326                         ASSETS_RE,
1327                         embed_webpage if age_gate else video_webpage,
1328                         'JS player URL (1)', default=None)
1329                     if not jsplayer_url_json and not age_gate:
1330                         # We need the embed website after all
1331                         if embed_webpage is None:
1332                             embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1333                             embed_webpage = self._download_webpage(
1334                                 embed_url, video_id, 'Downloading embed webpage')
1335                         jsplayer_url_json = self._search_regex(
1336                             ASSETS_RE, embed_webpage, 'JS player URL')
1337
1338                     player_url = json.loads(jsplayer_url_json)
1339                     if player_url is None:
1340                         player_url_json = self._search_regex(
1341                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1342                             video_webpage, 'age gate player URL')
1343                         player_url = json.loads(player_url_json)
1344
1345                     if self._downloader.params.get('verbose'):
1346                         if player_url is None:
1347                             player_version = 'unknown'
1348                             player_desc = 'unknown'
1349                         else:
1350                             if player_url.endswith('swf'):
1351                                 player_version = self._search_regex(
1352                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1353                                     'flash player', fatal=False)
1354                                 player_desc = 'flash player %s' % player_version
1355                             else:
1356                                 player_version = self._search_regex(
1357                                     [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
1358                                     player_url,
1359                                     'html5 player', fatal=False)
1360                                 player_desc = 'html5 player %s' % player_version
1361
1362                         parts_sizes = self._signature_cache_id(encrypted_sig)
1363                         self.to_screen('{%s} signature length %s, %s' %
1364                                        (format_id, parts_sizes, player_desc))
1365
1366                     signature = self._decrypt_signature(
1367                         encrypted_sig, video_id, player_url, age_gate)
1368                     url += '&signature=' + signature
1369                 if 'ratebypass' not in url:
1370                     url += '&ratebypass=yes'
1371
1372                 # Some itags are not included in DASH manifest thus corresponding formats will
1373                 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1374                 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1375                 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1376                 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1377                 dct = {
1378                     'format_id': format_id,
1379                     'url': url,
1380                     'player_url': player_url,
1381                     'filesize': int_or_none(url_data.get('clen', [None])[0]),
1382                     'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1383                     'width': width,
1384                     'height': height,
1385                     'fps': int_or_none(url_data.get('fps', [None])[0]),
1386                     'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
1387                 }
1388                 type_ = url_data.get('type', [None])[0]
1389                 if type_:
1390                     type_split = type_.split(';')
1391                     kind_ext = type_split[0].split('/')
1392                     if len(kind_ext) == 2:
1393                         kind, ext = kind_ext
1394                         dct['ext'] = ext
1395                         if kind in ('audio', 'video'):
1396                             codecs = None
1397                             for mobj in re.finditer(
1398                                     r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1399                                 if mobj.group('key') == 'codecs':
1400                                     codecs = mobj.group('val')
1401                                     break
1402                             if codecs:
1403                                 codecs = codecs.split(',')
1404                                 if len(codecs) == 2:
1405                                     acodec, vcodec = codecs[0], codecs[1]
1406                                 else:
1407                                     acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
1408                                 dct.update({
1409                                     'acodec': acodec,
1410                                     'vcodec': vcodec,
1411                                 })
1412                 if format_id in self._formats:
1413                     dct.update(self._formats[format_id])
1414                 formats.append(dct)
1415         elif video_info.get('hlsvp'):
1416             manifest_url = video_info['hlsvp'][0]
1417             url_map = self._extract_from_m3u8(manifest_url, video_id)
1418             formats = _map_to_format_list(url_map)
1419         else:
1420             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1421
1422         # Look for the DASH manifest
1423         if self._downloader.params.get('youtube_include_dash_manifest', True):
1424             dash_mpd_fatal = True
1425             for dash_manifest_url in dash_mpds:
1426                 dash_formats = {}
1427                 try:
1428                     for df in self._parse_dash_manifest(
1429                             video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1430                         # Do not overwrite DASH format found in some previous DASH manifest
1431                         if df['format_id'] not in dash_formats:
1432                             dash_formats[df['format_id']] = df
1433                         # Additional DASH manifests may end up in HTTP Error 403 therefore
1434                         # allow them to fail without bug report message if we already have
1435                         # some DASH manifest succeeded. This is temporary workaround to reduce
1436                         # burst of bug reports until we figure out the reason and whether it
1437                         # can be fixed at all.
1438                         dash_mpd_fatal = False
1439                 except (ExtractorError, KeyError) as e:
1440                     self.report_warning(
1441                         'Skipping DASH manifest: %r' % e, video_id)
1442                 if dash_formats:
1443                     # Remove the formats we found through non-DASH, they
1444                     # contain less info and it can be wrong, because we use
1445                     # fixed values (for example the resolution). See
1446                     # https://github.com/rg3/youtube-dl/issues/5774 for an
1447                     # example.
1448                     formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1449                     formats.extend(dash_formats.values())
1450
1451         # Check for malformed aspect ratio
1452         stretched_m = re.search(
1453             r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1454             video_webpage)
1455         if stretched_m:
1456             ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1457             for f in formats:
1458                 if f.get('vcodec') != 'none':
1459                     f['stretched_ratio'] = ratio
1460
1461         self._sort_formats(formats)
1462
1463         return {
1464             'id': video_id,
1465             'uploader': video_uploader,
1466             'uploader_id': video_uploader_id,
1467             'upload_date': upload_date,
1468             'title': video_title,
1469             'thumbnail': video_thumbnail,
1470             'description': video_description,
1471             'categories': video_categories,
1472             'tags': video_tags,
1473             'subtitles': video_subtitles,
1474             'automatic_captions': automatic_captions,
1475             'duration': video_duration,
1476             'age_limit': 18 if age_gate else 0,
1477             'annotations': video_annotations,
1478             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1479             'view_count': view_count,
1480             'like_count': like_count,
1481             'dislike_count': dislike_count,
1482             'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1483             'formats': formats,
1484             'is_live': is_live,
1485             'start_time': start_time,
1486             'end_time': end_time,
1487         }
1488
1489
class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, given as a URL or as a bare playlist id.

    Ids starting with 'RD' or 'UL' are handled as mixes (see _extract_mix);
    every other id is extracted from the regular playlist page.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Key was previously misspelled as 'playlist_mincout', so the
        # minimum-count expectation was never applied.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Return a playlist result for a mix playlist.

        The watch page of the seed video (the last 11 characters of
        playlist_id) is downloaded with the list parameter set, and the
        video ids linked from it with that list id are collected in order,
        without duplicates.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        # Try the known title containers from most to least specific
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Return a playlist result for a regular playlist page.

        Raises ExtractorError (expected) when the page carries an alert
        saying the playlist does not exist, is private, or that the
        parameters are invalid. Other alerts, except the language chooser,
        are reported as warnings.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Extract the playlist referenced by url.

        When the URL also names a single video (a 'v' query parameter) and
        the 'noplaylist' option is set, only that video is returned.
        Raises ExtractorError if url does not match _VALID_URL.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        # Group 1 is the id taken from a URL, group 2 a bare playlist id
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith(('RD', 'UL')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1656
1657
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos listed on a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    def _real_extract(self, url):
        """Return the channel's videos, preferring its uploads playlist.

        If a channel id starting with 'UC' can be read from the channel
        page, the result is a redirect to the matching 'UU' playlist.
        Otherwise the videos page itself is scraped.
        """
        channel_id = self._match_id(url)
        videos_url = self._TEMPLATE_URL % channel_id

        # Paged channel listings are capped at 35 pages of 30 items, i.e.
        # 1050 videos in total (see #5778). Try to find the channel's
        # playlist first and only fall back to paging when that fails.
        probe_page = self._download_webpage(
            videos_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        external_id = False
        if probe_page is not False:
            external_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not external_id:
                external_id = self._search_regex(
                    r'data-(?:channel-external-|yt)id="([^"]+)"',
                    probe_page, 'channel id', default=None)
        if external_id and external_id.startswith('UC'):
            uploads_id = 'UU' + external_id[2:]
            playlist_url = compat_urlparse.urljoin(
                videos_url, '/playlist?list=%s' % uploads_id)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        first_page = self._download_webpage(videos_url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', first_page)

        if autogenerated_marker is None:
            return self.playlist_result(
                self._entries(first_page, channel_id), channel_id)

        # Autogenerated channels keep every video on this one page; the
        # ajax pages can't be used because they are empty.
        single_page_entries = []
        for vid, vid_title in self.extract_videos_from_page(first_page):
            single_page_entries.append(self.url_result(
                vid, 'Youtube', video_id=vid, video_title=vid_title))
        return self.playlist_result(single_page_entries, channel_id)
1726
1727
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a YouTube user's videos (URL or "ytuser:" keyword)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module claims url.

        The pattern of this extractor is very permissive and would match
        URLs meant for other extractors, so every other module-level name
        ending in 'IE' gets the first say.
        """
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
1754
1755
class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the playlists page of a YouTube user."""
    IE_NAME = 'youtube:user:playlists'
    IE_DESC = 'YouTube.com user playlists'
    # The user name is captured as the 'id' group
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'

    _TEST = {
        'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
        'playlist_mincount': 4,
    }
1769
1770
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for YouTube searches issued through the 'ytsearch' key."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into every results-page query; subclasses
    # override this to alter the search.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until n videos have
        been collected or a page yields no video at all. At most n entries
        are returned, as a playlist result whose id is the query.

        Raises ExtractorError (expected) when a downloaded page contains
        the "search-message" marker.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as n results are available: fetching one more
            # page would be wasted, and could even fail the whole search
            # if that page turned out to be the "no results" page.
            if not new_videos or len(videos) >= n:
                break

        # n may be float('inf'), which is not a valid slice bound, so only
        # slice when there really is something to cut off
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
1814
1815
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the YouTube search extractor that lists newest videos first."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    # Added to every results-page query built by the parent class
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
1821
1822
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for URLs of YouTube search result pages."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist of the items linked from the results page.

        The playlist title is the decoded search query taken from url.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from_heading(heading_html):
            # A missing title is tolerated, a missing link is not
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        headings = re.findall(
            r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [_entry_from_heading(heading) for heading in headings],
            'title': query,
        }
1864
1865
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for YouTube show pages."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Extract the show by delegating to the parent class with the
        URL of the show's playlists page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
1883
1884
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """
        Collect the video ids listed on the feed page (following its
        'load more' continuations) and return them as a playlist result
        titled _PLAYLIST_TITLE. The `url` argument itself is not used; the
        feed URL is built from _FEED_NAME.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                # No continuation link: this was the last portion
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1932
1933
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list; always extracts the 'WL' playlist."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Extract the 'WL' playlist regardless of which matching URL form was given."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
1943
1944
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its backing playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """
        Download the favourites page, pull the first 'list=' value out of
        it and hand that playlist id to the 'YoutubePlaylist' extractor.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1955
1956
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1962
1963
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1969
1970
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' is not a valid
    # escape sequence in a plain string literal and newer Python versions
    # warn about it. The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1976
1977
class YoutubeTruncatedURLIE(InfoExtractor):
    """
    Catch YouTube URLs that stop right after 'watch?' (optionally followed
    by a single non-video parameter) or that are a bare attribution link,
    and report a helpful error instead of attempting extraction.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising the user to quote the URL."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)
2025
2026
class YoutubeTruncatedIDIE(InfoExtractor):
    """
    Catch watch URLs whose 'v=' value is only 1 to 10 characters long and
    report them as truncated instead of attempting extraction.
    """
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id and the URL."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)