[youtube] Add filesystem signature cache
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import itertools
5 import io
6 import json
7 import operator
8 import os.path
9 import re
10 import shutil
11 import socket
12 import string
13 import struct
14 import traceback
15 import zlib
16
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)
34
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the English-language site; return True on success."""
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in to YouTube with the configured credentials.

        Returns True on success, False when no credentials are set or the
        login failed. Raises ExtractorError when _LOGIN_REQUIRED is True
        and no login info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden anti-forgery tokens that must be echoed back in the form
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Confirm the age-verification form; return True on success."""
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # Encode to bytes, consistent with _login (required on Python 3)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age before extraction starts."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
141
142
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Information extractor for YouTube.com watch pages."""
    IE_DESC = u'YouTube.com'
    # Matches watch URLs in all their variants, embed/e/v URLs, youtu.be
    # short links, googleapis/nocookie hosts, and bare 11-character IDs.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the value of a next_url query parameter
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, ordered so that free (WebM) formats are preferred at
    # equal quality
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # Itags belonging to each container format, best first
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # Maps itag -> file extension
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # Maps itag -> human-readable resolution or quality/bitrate label
    # ('???' where unknown)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # Maps itag -> note describing the format's special property
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }
329
    IE_NAME = u'youtube'
    # Test definitions: URL, expected output file and expected metadata
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file":  u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'info_dict': {
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
            },
            u'params': {
                u'skip_download': True,
            },
        },
    ]
395
396
397     @classmethod
398     def suitable(cls, url):
399         """Receives a URL and returns True if suitable for this IE."""
400         if YoutubePlaylistIE.suitable(url): return False
401         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
402
403     def __init__(self, *args, **kwargs):
404         super(YoutubeIE, self).__init__(*args, **kwargs)
405         self._player_cache = {}
406
407     def report_video_webpage_download(self, video_id):
408         """Report attempt to download video webpage."""
409         self.to_screen(u'%s: Downloading video webpage' % video_id)
410
411     def report_video_info_webpage_download(self, video_id):
412         """Report attempt to download video info webpage."""
413         self.to_screen(u'%s: Downloading video info webpage' % video_id)
414
415     def report_information_extraction(self, video_id):
416         """Report attempt to extract video information."""
417         self.to_screen(u'%s: Extracting video information' % video_id)
418
419     def report_unavailable_format(self, video_id, format):
420         """Report extracted video URL."""
421         self.to_screen(u'%s: Format %s not available' % (video_id, format))
422
423     def report_rtmp_download(self):
424         """Indicate the download will use the RTMP protocol."""
425         self.to_screen(u'RTMP download detected')
426
427     def _extract_signature_function(self, video_id, player_url, slen):
428         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
429                         player_url)
430         player_type = id_m.group('ext')
431         player_id = id_m.group('id')
432
433         # Read from filesystem cache
434         func_id = '%s_%s_%d' % (player_type, player_id, slen)
435         assert os.path.basename(func_id) == func_id
436         cache_dir = self.downloader.params.get('cachedir',
437                                                u'~/.youtube-dl/cache')
438
439         if cache_dir is not False:
440             cache_fn = os.path.join(os.path.expanduser(cache_dir),
441                                     u'youtube-sigfuncs',
442                                     func_id + '.json')
443             try:
444                 with io.open(cache_fn, '', encoding='utf-8') as cachef:
445                     cache_spec = json.load(cachef)
446                 return lambda s: u''.join(s[i] for i in cache_spec)
447             except OSError:
448                 pass  # No cache available
449
450         if player_type == 'js':
451             code = self._download_webpage(
452                 player_url, video_id,
453                 note=u'Downloading %s player %s' % (player_type, player_id),
454                 errnote=u'Download of %s failed' % player_url)
455             res = self._parse_sig_js(code)
456         elif player_type == 'swf':
457             urlh = self._request_webpage(
458                 player_url, video_id,
459                 note=u'Downloading %s player %s' % (player_type, player_id),
460                 errnote=u'Download of %s failed' % player_url)
461             code = urlh.read()
462             res = self._parse_sig_swf(code)
463         else:
464             assert False, 'Invalid player type %r' % player_type
465
466         if cache_dir is not False:
467             cache_res = res(map(compat_chr, range(slen)))
468             cache_spec = [ord(c) for c in cache_res]
469             shutil.makedirs(os.path.dirname(cache_fn))
470             write_json_file(cache_spec, cache_fn)
471
472         return res
473
474     def _parse_sig_js(self, jscode):
475         funcname = self._search_regex(
476             r'signature=([a-zA-Z]+)', jscode,
477             u'Initial JS player signature function name')
478
479         functions = {}
480
481         def argidx(varname):
482             return string.lowercase.index(varname)
483
484         def interpret_statement(stmt, local_vars, allow_recursion=20):
485             if allow_recursion < 0:
486                 raise ExctractorError(u'Recursion limit reached')
487
488             if stmt.startswith(u'var '):
489                 stmt = stmt[len(u'var '):]
490             ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
491                              r'=(?P<expr>.*)$', stmt)
492             if ass_m:
493                 if ass_m.groupdict().get('index'):
494                     def assign(val):
495                         lvar = local_vars[ass_m.group('out')]
496                         idx = interpret_expression(ass_m.group('index'),
497                                                    local_vars, allow_recursion)
498                         assert isinstance(idx, int)
499                         lvar[idx] = val
500                         return val
501                     expr = ass_m.group('expr')
502                 else:
503                     def assign(val):
504                         local_vars[ass_m.group('out')] = val
505                         return val
506                     expr = ass_m.group('expr')
507             elif stmt.startswith(u'return '):
508                 assign = lambda v: v
509                 expr = stmt[len(u'return '):]
510             else:
511                 raise ExtractorError(
512                     u'Cannot determine left side of statement in %r' % stmt)
513
514             v = interpret_expression(expr, local_vars, allow_recursion)
515             return assign(v)
516
517         def interpret_expression(expr, local_vars, allow_recursion):
518             if expr.isdigit():
519                 return int(expr)
520
521             if expr.isalpha():
522                 return local_vars[expr]
523
524             m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
525             if m:
526                 member = m.group('member')
527                 val = local_vars[m.group('in')]
528                 if member == 'split("")':
529                     return list(val)
530                 if member == 'join("")':
531                     return u''.join(val)
532                 if member == 'length':
533                     return len(val)
534                 if member == 'reverse()':
535                     return val[::-1]
536                 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
537                 if slice_m:
538                     idx = interpret_expression(
539                         slice_m.group('idx'), local_vars, allow_recursion-1)
540                     return val[idx:]
541
542             m = re.match(
543                 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
544             if m:
545                 val = local_vars[m.group('in')]
546                 idx = interpret_expression(m.group('idx'), local_vars,
547                                            allow_recursion-1)
548                 return val[idx]
549
550             m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
551             if m:
552                 a = interpret_expression(m.group('a'),
553                                          local_vars, allow_recursion)
554                 b = interpret_expression(m.group('b'),
555                                          local_vars, allow_recursion)
556                 return a % b
557
558             m = re.match(
559                 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
560             if m:
561                 fname = m.group('func')
562                 if fname not in functions:
563                     functions[fname] = extract_function(fname)
564                 argvals = [int(v) if v.isdigit() else local_vars[v]
565                            for v in m.group('args').split(',')]
566                 return functions[fname](argvals)
567             raise ExtractorError(u'Unsupported JS expression %r' % expr)
568
569         def extract_function(funcname):
570             func_m = re.search(
571                 r'function ' + re.escape(funcname) +
572                 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
573                 jscode)
574             argnames = func_m.group('args').split(',')
575
576             def resf(args):
577                 local_vars = dict(zip(argnames, args))
578                 for stmt in func_m.group('code').split(';'):
579                     res = interpret_statement(stmt, local_vars)
580                 return res
581             return resf
582
583         initial_function = extract_function(funcname)
584         return lambda s: initial_function([s])
585
586     def _parse_sig_swf(self, file_contents):
587         if file_contents[1:3] != b'WS':
588             raise ExtractorError(
589                 u'Not an SWF file; header is %r' % file_contents[:3])
590         if file_contents[:1] == b'C':
591             content = zlib.decompress(file_contents[8:])
592         else:
593             raise NotImplementedError(u'Unsupported compression format %r' %
594                                       file_contents[:1])
595
596         def extract_tags(content):
597             pos = 0
598             while pos < len(content):
599                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
600                 pos += 2
601                 tag_code = header16 >> 6
602                 tag_len = header16 & 0x3f
603                 if tag_len == 0x3f:
604                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
605                     pos += 4
606                 assert pos+tag_len <= len(content)
607                 yield (tag_code, content[pos:pos+tag_len])
608                 pos += tag_len
609
610         code_tag = next(tag
611                         for tag_code, tag in extract_tags(content)
612                         if tag_code == 82)
613         p = code_tag.index(b'\0', 4) + 1
614         code_reader = io.BytesIO(code_tag[p:])
615
616         # Parse ABC (AVM2 ByteCode)
617         def read_int(reader=None):
618             if reader is None:
619                 reader = code_reader
620             res = 0
621             shift = 0
622             for _ in range(5):
623                 buf = reader.read(1)
624                 assert len(buf) == 1
625                 b = struct.unpack('<B', buf)[0]
626                 res = res | ((b & 0x7f) << shift)
627                 if b & 0x80 == 0:
628                     break
629                 shift += 7
630             return res
631
632         def u30(reader=None):
633             res = read_int(reader)
634             assert res & 0xf0000000 == 0
635             return res
636         u32 = read_int
637
638         def s32(reader=None):
639             v = read_int(reader)
640             if v & 0x80000000 != 0:
641                 v = - ((v ^ 0xffffffff) + 1)
642             return v
643
644         def string(reader=None):
645             if reader is None:
646                 reader = code_reader
647             slen = u30(reader)
648             resb = reader.read(slen)
649             assert len(resb) == slen
650             return resb.decode('utf-8')
651
652         def read_bytes(count, reader=None):
653             if reader is None:
654                 reader = code_reader
655             resb = reader.read(count)
656             assert len(resb) == count
657             return resb
658
659         def read_byte(reader=None):
660             resb = read_bytes(1, reader=reader)
661             res = struct.unpack('<B', resb)[0]
662             return res
663
664         # minor_version + major_version
665         _ = read_bytes(2 + 2)
666
667         # Constant pool
668         int_count = u30()
669         for _c in range(1, int_count):
670             _ = s32()
671         uint_count = u30()
672         for _c in range(1, uint_count):
673             _ = u32()
674         double_count = u30()
675         _ = read_bytes((double_count-1) * 8)
676         string_count = u30()
677         constant_strings = [u'']
678         for _c in range(1, string_count):
679             s = string()
680             constant_strings.append(s)
681         namespace_count = u30()
682         for _c in range(1, namespace_count):
683             _ = read_bytes(1)  # kind
684             _ = u30()  # name
685         ns_set_count = u30()
686         for _c in range(1, ns_set_count):
687             count = u30()
688             for _c2 in range(count):
689                 _ = u30()
690         multiname_count = u30()
691         MULTINAME_SIZES = {
692             0x07: 2,  # QName
693             0x0d: 2,  # QNameA
694             0x0f: 1,  # RTQName
695             0x10: 1,  # RTQNameA
696             0x11: 0,  # RTQNameL
697             0x12: 0,  # RTQNameLA
698             0x09: 2,  # Multiname
699             0x0e: 2,  # MultinameA
700             0x1b: 1,  # MultinameL
701             0x1c: 1,  # MultinameLA
702         }
703         multinames = [u'']
704         for _c in range(1, multiname_count):
705             kind = u30()
706             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
707             if kind == 0x07:
708                 namespace_idx = u30()
709                 name_idx = u30()
710                 multinames.append(constant_strings[name_idx])
711             else:
712                 multinames.append('[MULTINAME kind: %d]' % kind)
713                 for _c2 in range(MULTINAME_SIZES[kind]):
714                     _ = u30()
715
716         # Methods
717         method_count = u30()
718         MethodInfo = collections.namedtuple(
719             'MethodInfo',
720             ['NEED_ARGUMENTS', 'NEED_REST'])
721         method_infos = []
722         for method_id in range(method_count):
723             param_count = u30()
724             _ = u30()  # return type
725             for _ in range(param_count):
726                 _ = u30()  # param type
727             _ = u30()  # name index (always 0 for youtube)
728             flags = read_byte()
729             if flags & 0x08 != 0:
730                 # Options present
731                 option_count = u30()
732                 for c in range(option_count):
733                     _ = u30()  # val
734                     _ = read_bytes(1)  # kind
735             if flags & 0x80 != 0:
736                 # Param names present
737                 for _ in range(param_count):
738                     _ = u30()  # param name
739             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
740             method_infos.append(mi)
741
742         # Metadata
743         metadata_count = u30()
744         for _c in range(metadata_count):
745             _ = u30()  # name
746             item_count = u30()
747             for _c2 in range(item_count):
748                 _ = u30()  # key
749                 _ = u30()  # value
750
751         def parse_traits_info():
752             trait_name_idx = u30()
753             kind_full = read_byte()
754             kind = kind_full & 0x0f
755             attrs = kind_full >> 4
756             methods = {}
757             if kind in [0x00, 0x06]:  # Slot or Const
758                 _ = u30()  # Slot id
759                 type_name_idx = u30()
760                 vindex = u30()
761                 if vindex != 0:
762                     _ = read_byte()  # vkind
763             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
764                 _ = u30()  # disp_id
765                 method_idx = u30()
766                 methods[multinames[trait_name_idx]] = method_idx
767             elif kind == 0x04:  # Class
768                 _ = u30()  # slot_id
769                 _ = u30()  # classi
770             elif kind == 0x05:  # Function
771                 _ = u30()  # slot_id
772                 function_idx = u30()
773                 methods[function_idx] = multinames[trait_name_idx]
774             else:
775                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
776
777             if attrs & 0x4 != 0:  # Metadata present
778                 metadata_count = u30()
779                 for _c3 in range(metadata_count):
780                     _ = u30()
781
782             return methods
783
784         # Classes
785         TARGET_CLASSNAME = u'SignatureDecipher'
786         searched_idx = multinames.index(TARGET_CLASSNAME)
787         searched_class_id = None
788         class_count = u30()
789         for class_id in range(class_count):
790             name_idx = u30()
791             if name_idx == searched_idx:
792                 # We found the class we're looking for!
793                 searched_class_id = class_id
794             _ = u30()  # super_name idx
795             flags = read_byte()
796             if flags & 0x08 != 0:  # Protected namespace is present
797                 protected_ns_idx = u30()
798             intrf_count = u30()
799             for _c2 in range(intrf_count):
800                 _ = u30()
801             _ = u30()  # iinit
802             trait_count = u30()
803             for _c2 in range(trait_count):
804                 _ = parse_traits_info()
805
806         if searched_class_id is None:
807             raise ExtractorError(u'Target class %r not found' %
808                                  TARGET_CLASSNAME)
809
810         method_names = {}
811         method_idxs = {}
812         for class_id in range(class_count):
813             _ = u30()  # cinit
814             trait_count = u30()
815             for _c2 in range(trait_count):
816                 trait_methods = parse_traits_info()
817                 if class_id == searched_class_id:
818                     method_names.update(trait_methods.items())
819                     method_idxs.update(dict(
820                         (idx, name)
821                         for name, idx in trait_methods.items()))
822
823         # Scripts
824         script_count = u30()
825         for _c in range(script_count):
826             _ = u30()  # init
827             trait_count = u30()
828             for _c2 in range(trait_count):
829                 _ = parse_traits_info()
830
831         # Method bodies
832         method_body_count = u30()
833         Method = collections.namedtuple('Method', ['code', 'local_count'])
834         methods = {}
835         for _c in range(method_body_count):
836             method_idx = u30()
837             max_stack = u30()
838             local_count = u30()
839             init_scope_depth = u30()
840             max_scope_depth = u30()
841             code_length = u30()
842             code = read_bytes(code_length)
843             if method_idx in method_idxs:
844                 m = Method(code, local_count)
845                 methods[method_idxs[method_idx]] = m
846             exception_count = u30()
847             for _c2 in range(exception_count):
848                 _ = u30()  # from
849                 _ = u30()  # to
850                 _ = u30()  # target
851                 _ = u30()  # exc_type
852                 _ = u30()  # var_name
853             trait_count = u30()
854             for _c2 in range(trait_count):
855                 _ = parse_traits_info()
856
857         assert p + code_reader.tell() == len(code_tag)
858         assert len(methods) == len(method_idxs)
859
860         method_pyfunctions = {}
861
862         def extract_function(func_name):
863             if func_name in method_pyfunctions:
864                 return method_pyfunctions[func_name]
865             if func_name not in methods:
866                 raise ExtractorError(u'Cannot find function %r' % func_name)
867             m = methods[func_name]
868
869             def resfunc(args):
870                 registers = ['(this)'] + list(args) + [None] * m.local_count
871                 stack = []
872                 coder = io.BytesIO(m.code)
873                 while True:
874                     opcode = struct.unpack('!B', coder.read(1))[0]
875                     if opcode == 36:  # pushbyte
876                         v = struct.unpack('!B', coder.read(1))[0]
877                         stack.append(v)
878                     elif opcode == 44:  # pushstring
879                         idx = u30(coder)
880                         stack.append(constant_strings[idx])
881                     elif opcode == 48:  # pushscope
882                         # We don't implement the scope register, so we'll just
883                         # ignore the popped value
884                         stack.pop()
885                     elif opcode == 70:  # callproperty
886                         index = u30(coder)
887                         mname = multinames[index]
888                         arg_count = u30(coder)
889                         args = list(reversed(
890                             [stack.pop() for _ in range(arg_count)]))
891                         obj = stack.pop()
892                         if mname == u'split':
893                             assert len(args) == 1
894                             assert isinstance(args[0], compat_str)
895                             assert isinstance(obj, compat_str)
896                             if args[0] == u'':
897                                 res = list(obj)
898                             else:
899                                 res = obj.split(args[0])
900                             stack.append(res)
901                         elif mname == u'slice':
902                             assert len(args) == 1
903                             assert isinstance(args[0], int)
904                             assert isinstance(obj, list)
905                             res = obj[args[0]:]
906                             stack.append(res)
907                         elif mname == u'join':
908                             assert len(args) == 1
909                             assert isinstance(args[0], compat_str)
910                             assert isinstance(obj, list)
911                             res = args[0].join(obj)
912                             stack.append(res)
913                         elif mname in method_pyfunctions:
914                             stack.append(method_pyfunctions[mname](args))
915                         else:
916                             raise NotImplementedError(
917                                 u'Unsupported property %r on %r'
918                                 % (mname, obj))
919                     elif opcode == 72:  # returnvalue
920                         res = stack.pop()
921                         return res
922                     elif opcode == 79:  # callpropvoid
923                         index = u30(coder)
924                         mname = multinames[index]
925                         arg_count = u30(coder)
926                         args = list(reversed(
927                             [stack.pop() for _ in range(arg_count)]))
928                         obj = stack.pop()
929                         if mname == u'reverse':
930                             assert isinstance(obj, list)
931                             obj.reverse()
932                         else:
933                             raise NotImplementedError(
934                                 u'Unsupported (void) property %r on %r'
935                                 % (mname, obj))
936                     elif opcode == 93:  # findpropstrict
937                         index = u30(coder)
938                         mname = multinames[index]
939                         res = extract_function(mname)
940                         stack.append(res)
941                     elif opcode == 97:  # setproperty
942                         index = u30(coder)
943                         value = stack.pop()
944                         idx = stack.pop()
945                         obj = stack.pop()
946                         assert isinstance(obj, list)
947                         assert isinstance(idx, int)
948                         obj[idx] = value
949                     elif opcode == 98:  # getlocal
950                         index = u30(coder)
951                         stack.append(registers[index])
952                     elif opcode == 99:  # setlocal
953                         index = u30(coder)
954                         value = stack.pop()
955                         registers[index] = value
956                     elif opcode == 102:  # getproperty
957                         index = u30(coder)
958                         pname = multinames[index]
959                         if pname == u'length':
960                             obj = stack.pop()
961                             assert isinstance(obj, list)
962                             stack.append(len(obj))
963                         else:  # Assume attribute access
964                             idx = stack.pop()
965                             assert isinstance(idx, int)
966                             obj = stack.pop()
967                             assert isinstance(obj, list)
968                             stack.append(obj[idx])
969                     elif opcode == 128:  # coerce
970                         _ = u30(coder)
971                     elif opcode == 133:  # coerce_s
972                         assert isinstance(stack[-1], (type(None), compat_str))
973                     elif opcode == 164:  # modulo
974                         value2 = stack.pop()
975                         value1 = stack.pop()
976                         res = value1 % value2
977                         stack.append(res)
978                     elif opcode == 208:  # getlocal_0
979                         stack.append(registers[0])
980                     elif opcode == 209:  # getlocal_1
981                         stack.append(registers[1])
982                     elif opcode == 210:  # getlocal_2
983                         stack.append(registers[2])
984                     elif opcode == 211:  # getlocal_3
985                         stack.append(registers[3])
986                     elif opcode == 214:  # setlocal_2
987                         registers[2] = stack.pop()
988                     elif opcode == 215:  # setlocal_3
989                         registers[3] = stack.pop()
990                     else:
991                         raise NotImplementedError(
992                             u'Unsupported opcode %d' % opcode)
993
994             method_pyfunctions[func_name] = resfunc
995             return resfunc
996
997         initial_function = extract_function(u'decipher')
998         return lambda s: initial_function([s])
999
1000     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1001         """Turn the encrypted s field into a working signature"""
1002
1003         if player_url is not None:
1004             try:
1005                 if player_url not in self._player_cache:
1006                     func = self._extract_signature_function(
1007                         video_id, player_url, len(s)
1008                     )
1009                     self._player_cache[player_url] = func
1010                 return self._player_cache[player_url](s)
1011             except Exception as e:
1012                 tb = traceback.format_exc()
1013                 self._downloader.report_warning(
1014                     u'Automatic signature extraction failed: ' + tb)
1015
1016         self._downloader.report_warning(
1017             u'Warning: Falling back to static signature algorithm')
1018         return self._static_decrypt_signature(
1019             s, video_id, player_url, age_gate)
1020
1021     def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1022         if age_gate:
1023             # The videos with age protection use another player, so the
1024             # algorithms can be different.
1025             if len(s) == 86:
1026                 return s[2:63] + s[82] + s[64:82] + s[63]
1027
1028         if len(s) == 92:
1029             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1030         elif len(s) == 90:
1031             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1032         elif len(s) == 89:
1033             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1034         elif len(s) == 88:
1035             return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1036         elif len(s) == 87:
1037             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1038         elif len(s) == 86:
1039             return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1040         elif len(s) == 85:
1041             return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1042         elif len(s) == 84:
1043             return s[81:36:-1] + s[0] + s[35:2:-1]
1044         elif len(s) == 83:
1045             return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1046         elif len(s) == 82:
1047             return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1048         elif len(s) == 81:
1049             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1050         elif len(s) == 80:
1051             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1052         elif len(s) == 79:
1053             return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1054
1055         else:
1056             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1057
1058     def _decrypt_signature_age_gate(self, s):
1059         # The videos with age protection use another player, so the algorithms
1060         # can be different.
1061         if len(s) == 86:
1062             return s[2:63] + s[82] + s[64:82] + s[63]
1063         else:
1064             # Fallback to the other algortihms
1065             return self._decrypt_signature(s)
1066
1067     def _get_available_subtitles(self, video_id):
1068         try:
1069             sub_list = self._download_webpage(
1070                 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1071                 video_id, note=False)
1072         except ExtractorError as err:
1073             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1074             return {}
1075         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1076
1077         sub_lang_list = {}
1078         for l in lang_list:
1079             lang = l[1]
1080             params = compat_urllib_parse.urlencode({
1081                 'lang': lang,
1082                 'v': video_id,
1083                 'fmt': self._downloader.params.get('subtitlesformat'),
1084             })
1085             url = u'http://www.youtube.com/api/timedtext?' + params
1086             sub_lang_list[lang] = url
1087         if not sub_lang_list:
1088             self._downloader.report_warning(u'video doesn\'t have subtitles')
1089             return {}
1090         return sub_lang_list
1091
1092     def _get_available_automatic_caption(self, video_id, webpage):
1093         """We need the webpage for getting the captions url, pass it as an
1094            argument to speed up the process."""
1095         sub_format = self._downloader.params.get('subtitlesformat')
1096         self.to_screen(u'%s: Looking for automatic captions' % video_id)
1097         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1098         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1099         if mobj is None:
1100             self._downloader.report_warning(err_msg)
1101             return {}
1102         player_config = json.loads(mobj.group(1))
1103         try:
1104             args = player_config[u'args']
1105             caption_url = args[u'ttsurl']
1106             timestamp = args[u'timestamp']
1107             # We get the available subtitles
1108             list_params = compat_urllib_parse.urlencode({
1109                 'type': 'list',
1110                 'tlangs': 1,
1111                 'asrs': 1,
1112             })
1113             list_url = caption_url + '&' + list_params
1114             list_page = self._download_webpage(list_url, video_id)
1115             caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1116             original_lang_node = caption_list.find('track')
1117             if original_lang_node.attrib.get('kind') != 'asr' :
1118                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1119                 return {}
1120             original_lang = original_lang_node.attrib['lang_code']
1121
1122             sub_lang_list = {}
1123             for lang_node in caption_list.findall('target'):
1124                 sub_lang = lang_node.attrib['lang_code']
1125                 params = compat_urllib_parse.urlencode({
1126                     'lang': original_lang,
1127                     'tlang': sub_lang,
1128                     'fmt': sub_format,
1129                     'ts': timestamp,
1130                     'kind': 'asr',
1131                 })
1132                 sub_lang_list[sub_lang] = caption_url + '&' + params
1133             return sub_lang_list
1134         # An extractor error can be raise by the download process if there are
1135         # no automatic captions but there are subtitles
1136         except (KeyError, ExtractorError):
1137             self._downloader.report_warning(err_msg)
1138             return {}
1139
1140     def _print_formats(self, formats):
1141         print('Available formats:')
1142         for x in formats:
1143             print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1144                                         self._video_dimensions.get(x, '???'),
1145                                         ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1146
1147     def _extract_id(self, url):
1148         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1149         if mobj is None:
1150             raise ExtractorError(u'Invalid URL: %s' % url)
1151         video_id = mobj.group(2)
1152         return video_id
1153
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Returns None after printing when the user asked to list formats;
        raises ExtractorError when no known or requested format is available.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality-ordered itag list; the free-formats variant is preferred
        # only when the user opted in.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Cap quality: keep only formats at or below the requested limit.
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            # --list-formats: print and return None (caller stops extraction).
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    # The token is an itag that is directly available.
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    # The token is a container name ('mp4', 'flv', ...);
                    # try its itags in quality order.
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        # for/else: no itag of this container was available,
                        # so move on to the next requested token.
                        continue
                    # Inner loop exited via break (a match was found), so
                    # stop scanning the remaining tokens too.
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1202
1203     def _extract_from_m3u8(self, manifest_url, video_id):
1204         url_map = {}
1205         def _get_urls(_manifest):
1206             lines = _manifest.split('\n')
1207             urls = filter(lambda l: l and not l.startswith('#'),
1208                             lines)
1209             return urls
1210         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1211         formats_urls = _get_urls(manifest)
1212         for format_url in formats_urls:
1213             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1214             url_map[itag] = format_url
1215         return url_map
1216
1217     def _real_extract(self, url):
1218         if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1219             self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like  youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply  youtube-dl BaW_jenozKc  ).')
1220
1221         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1222         mobj = re.search(self._NEXT_URL_RE, url)
1223         if mobj:
1224             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1225         video_id = self._extract_id(url)
1226
1227         # Get video webpage
1228         self.report_video_webpage_download(video_id)
1229         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1230         request = compat_urllib_request.Request(url)
1231         try:
1232             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1233         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1234             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1235
1236         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1237
1238         # Attempt to extract SWF player URL
1239         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1240         if mobj is not None:
1241             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1242         else:
1243             player_url = None
1244
1245         # Get video info
1246         self.report_video_info_webpage_download(video_id)
1247         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1248             self.report_age_confirmation()
1249             age_gate = True
1250             # We simulate the access to the video from www.youtube.com/v/{video_id}
1251             # this can be viewed without login into Youtube
1252             data = compat_urllib_parse.urlencode({'video_id': video_id,
1253                                                   'el': 'embedded',
1254                                                   'gl': 'US',
1255                                                   'hl': 'en',
1256                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1257                                                   'asv': 3,
1258                                                   'sts':'1588',
1259                                                   })
1260             video_info_url = 'https://www.youtube.com/get_video_info?' + data
1261             video_info_webpage = self._download_webpage(video_info_url, video_id,
1262                                     note=False,
1263                                     errnote='unable to download video info webpage')
1264             video_info = compat_parse_qs(video_info_webpage)
1265         else:
1266             age_gate = False
1267             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1268                 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1269                         % (video_id, el_type))
1270                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1271                                         note=False,
1272                                         errnote='unable to download video info webpage')
1273                 video_info = compat_parse_qs(video_info_webpage)
1274                 if 'token' in video_info:
1275                     break
1276         if 'token' not in video_info:
1277             if 'reason' in video_info:
1278                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1279             else:
1280                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1281
1282         # Check for "rental" videos
1283         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1284             raise ExtractorError(u'"rental" videos not supported')
1285
1286         # Start extracting information
1287         self.report_information_extraction(video_id)
1288
1289         # uploader
1290         if 'author' not in video_info:
1291             raise ExtractorError(u'Unable to extract uploader name')
1292         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1293
1294         # uploader_id
1295         video_uploader_id = None
1296         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1297         if mobj is not None:
1298             video_uploader_id = mobj.group(1)
1299         else:
1300             self._downloader.report_warning(u'unable to extract uploader nickname')
1301
1302         # title
1303         if 'title' not in video_info:
1304             raise ExtractorError(u'Unable to extract video title')
1305         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1306
1307         # thumbnail image
1308         # We try first to get a high quality image:
1309         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1310                             video_webpage, re.DOTALL)
1311         if m_thumb is not None:
1312             video_thumbnail = m_thumb.group(1)
1313         elif 'thumbnail_url' not in video_info:
1314             self._downloader.report_warning(u'unable to extract video thumbnail')
1315             video_thumbnail = ''
1316         else:   # don't panic if we can't find it
1317             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1318
1319         # upload date
1320         upload_date = None
1321         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1322         if mobj is not None:
1323             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1324             upload_date = unified_strdate(upload_date)
1325
1326         # description
1327         video_description = get_element_by_id("eow-description", video_webpage)
1328         if video_description:
1329             video_description = clean_html(video_description)
1330         else:
1331             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1332             if fd_mobj:
1333                 video_description = unescapeHTML(fd_mobj.group(1))
1334             else:
1335                 video_description = u''
1336
1337         # subtitles
1338         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1339
1340         if self._downloader.params.get('listsubtitles', False):
1341             self._list_available_subtitles(video_id, video_webpage)
1342             return
1343
1344         if 'length_seconds' not in video_info:
1345             self._downloader.report_warning(u'unable to extract video duration')
1346             video_duration = ''
1347         else:
1348             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1349
1350         # Decide which formats to download
1351
1352         try:
1353             mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1354             if not mobj:
1355                 raise ValueError('Could not find vevo ID')
1356             info = json.loads(mobj.group(1))
1357             args = info['args']
1358             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1359             # this signatures are encrypted
1360             m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1361             if m_s is not None:
1362                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1363                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1364             m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1365             if m_s is not None:
1366                 if 'url_encoded_fmt_stream_map' in video_info:
1367                     video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1368                 else:
1369                     video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1370             elif 'adaptive_fmts' in video_info:
1371                 if 'url_encoded_fmt_stream_map' in video_info:
1372                     video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1373                 else:
1374                     video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1375         except ValueError:
1376             pass
1377
1378         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1379             self.report_rtmp_download()
1380             video_url_list = [(None, video_info['conn'][0])]
1381         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1382             if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1383                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1384             url_map = {}
1385             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1386                 url_data = compat_parse_qs(url_data_str)
1387                 if 'itag' in url_data and 'url' in url_data:
1388                     url = url_data['url'][0]
1389                     if 'sig' in url_data:
1390                         url += '&signature=' + url_data['sig'][0]
1391                     elif 's' in url_data:
1392                         encrypted_sig = url_data['s'][0]
1393                         if self._downloader.params.get('verbose'):
1394                             if age_gate:
1395                                 player_version = self._search_regex(
1396                                     r'-(.+)\.swf$',
1397                                     player_url if player_url else None,
1398                                     'flash player', fatal=False)
1399                                 player_desc = 'flash player %s' % player_version
1400                             else:
1401                                 player_version = self._search_regex(
1402                                     r'html5player-(.+?)\.js', video_webpage,
1403                                     'html5 player', fatal=False)
1404                                 player_desc = u'html5 player %s' % player_version
1405
1406                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1407                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1408                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1409
1410                         if not age_gate:
1411                             jsplayer_url_json = self._search_regex(
1412                                 r'"assets":.+?"js":\s*("[^"]+")',
1413                                 video_webpage, u'JS player URL')
1414                             player_url = json.loads(jsplayer_url_json)
1415
1416                         signature = self._decrypt_signature(
1417                             encrypted_sig, video_id, player_url, age_gate)
1418                         url += '&signature=' + signature
1419                     if 'ratebypass' not in url:
1420                         url += '&ratebypass=yes'
1421                     url_map[url_data['itag'][0]] = url
1422             video_url_list = self._get_video_url_list(url_map)
1423             if not video_url_list:
1424                 return
1425         elif video_info.get('hlsvp'):
1426             manifest_url = video_info['hlsvp'][0]
1427             url_map = self._extract_from_m3u8(manifest_url, video_id)
1428             video_url_list = self._get_video_url_list(url_map)
1429             if not video_url_list:
1430                 return
1431
1432         else:
1433             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1434
1435         results = []
1436         for format_param, video_real_url in video_url_list:
1437             # Extension
1438             video_extension = self._video_extensions.get(format_param, 'flv')
1439
1440             video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1441                                               self._video_dimensions.get(format_param, '???'),
1442                                               ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1443
1444             results.append({
1445                 'id':       video_id,
1446                 'url':      video_real_url,
1447                 'uploader': video_uploader,
1448                 'uploader_id': video_uploader_id,
1449                 'upload_date':  upload_date,
1450                 'title':    video_title,
1451                 'ext':      video_extension,
1452                 'format':   video_format,
1453                 'thumbnail':    video_thumbnail,
1454                 'description':  video_description,
1455                 'player_url':   player_url,
1456                 'subtitles':    video_subtitles,
1457                 'duration':     video_duration
1458             })
1459         return results
1460
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch every entry of the playlist via the GData API and return
        a single playlist result preserving the playlist order."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The playlist id is captured by one of the two regex alternatives.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, watch URL) pairs page by page.
        indexed_videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The API does not serve results beyond this index.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    watch_url = 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    indexed_videos.append((index, watch_url))

        # Sort by playlist position, then drop the indices.
        video_urls = [pair[1] for pair in sorted(indexed_videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in video_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1528
1529
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from a channel page,
        keeping first-seen order."""
        found = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect all video ids of the channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is served as plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = self.extract_videos_from_page(page)

        # Any subsequent pages come from the json-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1584
1585
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: this regex is too permissive and would match their
        # URLs as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result containing all uploads of the user.

        Video ids are fetched through the YouTube Data API. Result size per
        query is limited (currently to 50 videos) so we need to query page
        by page until a non-full page tells us we got all of them.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # start_index is 1-based and the page holds _GDATA_PAGE_SIZE
            # entries, so the last reported index is start_index + size - 1.
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'feed' not in response:
                # On errors the API answers with a JSON body that carries no
                # feed; report that cleanly instead of raising a bare KeyError
                # (same handling as YoutubePlaylistIE).
                raise ExtractorError(u'Got a malformed response from YouTube API')
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers: the id is the last path component
            # of the entry's id URL.
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not "full", ie. does
            # not contain PAGE_SIZE video ids then we can assume that this
            # page is the last one - there are no more ids on further pages -
            # no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1650
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        limit = n
        pagenum = 0

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API may know fewer matches than we asked for.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page can overshoot the requested count; trim the excess.
        del video_ids[n:]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1692
1693
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url_result per season of the show."""
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        results = []
        for season in m_seasons:
            results.append(self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1707
1708
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Per-user feeds need a different ajax action than system feeds.
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all linked videos as a playlist."""
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_idx in itertools.count(0):
            info = self._download_webpage(self._FEED_TEMPLATE % (page_idx * self._PAGING_STEP),
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_idx)
            info = json.loads(info)
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            video_ids = orderedSet(m.group(1) for m in matches)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in video_ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1750
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1756
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1762
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Paging advances in steps of 100 here (the base class default is 30).
    _PAGING_STEP = 100
    # Watch Later is per-user, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
1770
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to its backing playlist and delegate
        extraction to YoutubePlaylistIE."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        # The page embeds the id of the playlist that backs the favourites.
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')