Merge pull request #3180 from hakatashi/niconico-without-authentication
[youtube-dl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18     compat_chr,
19     compat_parse_qs,
20     compat_urllib_parse,
21     compat_urllib_request,
22     compat_urlparse,
23     compat_str,
24
25     clean_html,
26     get_cachedir,
27     get_element_by_id,
28     get_element_by_attribute,
29     ExtractorError,
30     int_or_none,
31     PagedList,
32     unescapeHTML,
33     unified_strdate,
34     orderedSet,
35     write_json_file,
36     uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Shared helpers (language, login, age gate) for the Youtube extractors."""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Subclasses flip this to True to make missing credentials a hard error.
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Force the site language to English; False when the request fails."""
        page = self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False)
        return bool(page)

    def _login(self):
        """Log in with the configured credentials.

        Returns True on success and False otherwise (a warning is emitted
        for rejected credentials).  Raises ExtractorError when
        _LOGIN_REQUIRED is set but no credentials are available.
        """
        username, password = self._get_login_info()
        if username is None:
            # No credentials configured; only fatal when login is mandatory.
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Hidden-form fields Google expects alongside the credentials.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Encode to UTF-8 *before* urlencode: Python 2.x's urlencode chokes
        # on unicode values.
        login_form = dict(
            (key.encode('utf-8'), value.encode('utf-8'))
            for key, value in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            request, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Seeing the login form again means authentication was rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))
        self._download_webpage(
            request, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Run language setup, login and age confirmation, in that order."""
        if self._downloader is None:
            return
        # Each step is skipped when the previous one failed.
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Matches watch pages, embeds, short youtu.be links, several proxy/mirror
    # hosts, and bare 11-character video IDs; see the inline annotations.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the target of a /verify_age?next_url=... style redirect.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format properties.  Negative 'preference' values push
    # DASH/HLS variants below the plain progressive formats in the default
    # format ordering.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
236
    IE_NAME = u'youtube'
    # Regression fixtures exercised by the test suite; each entry pins a URL
    # to the metadata the extractor is expected to produce.
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file":  u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
322
323     @classmethod
324     def suitable(cls, url):
325         """Receives a URL and returns True if suitable for this IE."""
326         if YoutubePlaylistIE.suitable(url): return False
327         return re.match(cls._VALID_URL, url) is not None
328
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of extracted signature-decipher functions, keyed per player.
        self._player_cache = {}
332
333     def report_video_info_webpage_download(self, video_id):
334         """Report attempt to download video info webpage."""
335         self.to_screen(u'%s: Downloading video info webpage' % video_id)
336
337     def report_information_extraction(self, video_id):
338         """Report attempt to extract video information."""
339         self.to_screen(u'%s: Extracting video information' % video_id)
340
341     def report_unavailable_format(self, video_id, format):
342         """Report extracted video URL."""
343         self.to_screen(u'%s: Format %s not available' % (video_id, format))
344
345     def report_rtmp_download(self):
346         """Indicate the download will use the RTMP protocol."""
347         self.to_screen(u'RTMP download detected')
348
349     def _extract_signature_function(self, video_id, player_url, slen):
350         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
351                         player_url)
352         player_type = id_m.group('ext')
353         player_id = id_m.group('id')
354
355         # Read from filesystem cache
356         func_id = '%s_%s_%d' % (player_type, player_id, slen)
357         assert os.path.basename(func_id) == func_id
358         cache_dir = get_cachedir(self._downloader.params)
359
360         cache_enabled = cache_dir is not None
361         if cache_enabled:
362             cache_fn = os.path.join(os.path.expanduser(cache_dir),
363                                     u'youtube-sigfuncs',
364                                     func_id + '.json')
365             try:
366                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
367                     cache_spec = json.load(cachef)
368                 return lambda s: u''.join(s[i] for i in cache_spec)
369             except IOError:
370                 pass  # No cache available
371
372         if player_type == 'js':
373             code = self._download_webpage(
374                 player_url, video_id,
375                 note=u'Downloading %s player %s' % (player_type, player_id),
376                 errnote=u'Download of %s failed' % player_url)
377             res = self._parse_sig_js(code)
378         elif player_type == 'swf':
379             urlh = self._request_webpage(
380                 player_url, video_id,
381                 note=u'Downloading %s player %s' % (player_type, player_id),
382                 errnote=u'Download of %s failed' % player_url)
383             code = urlh.read()
384             res = self._parse_sig_swf(code)
385         else:
386             assert False, 'Invalid player type %r' % player_type
387
388         if cache_enabled:
389             try:
390                 test_string = u''.join(map(compat_chr, range(slen)))
391                 cache_res = res(test_string)
392                 cache_spec = [ord(c) for c in cache_res]
393                 try:
394                     os.makedirs(os.path.dirname(cache_fn))
395                 except OSError as ose:
396                     if ose.errno != errno.EEXIST:
397                         raise
398                 write_json_file(cache_spec, cache_fn)
399             except Exception:
400                 tb = traceback.format_exc()
401                 self._downloader.report_warning(
402                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
403
404         return res
405
    def _print_sig_code(self, func, slen):
        """Print *func* (a signature function valid for length *slen*) as
        equivalent Python source, for debugging.

        Runs of indices with a constant stride of +/-1 are rendered as slice
        expressions; isolated indices become single subscripts.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting default parts.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep going while the stride continues.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new +/-1 run begins at the previous index.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the still-open run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Recover the index permutation by applying func to an identity
        # string, then emit it as compact Python.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
441
442     def _parse_sig_js(self, jscode):
443         funcname = self._search_regex(
444             r'signature=([$a-zA-Z]+)', jscode,
445              u'Initial JS player signature function name')
446
447         jsi = JSInterpreter(jscode)
448         initial_function = jsi.extract_function(funcname)
449         return lambda s: initial_function([s])
450
451     def _parse_sig_swf(self, file_contents):
452         if file_contents[1:3] != b'WS':
453             raise ExtractorError(
454                 u'Not an SWF file; header is %r' % file_contents[:3])
455         if file_contents[:1] == b'C':
456             content = zlib.decompress(file_contents[8:])
457         else:
458             raise NotImplementedError(u'Unsupported compression format %r' %
459                                       file_contents[:1])
460
461         def extract_tags(content):
462             pos = 0
463             while pos < len(content):
464                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
465                 pos += 2
466                 tag_code = header16 >> 6
467                 tag_len = header16 & 0x3f
468                 if tag_len == 0x3f:
469                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
470                     pos += 4
471                 assert pos+tag_len <= len(content)
472                 yield (tag_code, content[pos:pos+tag_len])
473                 pos += tag_len
474
475         code_tag = next(tag
476                         for tag_code, tag in extract_tags(content)
477                         if tag_code == 82)
478         p = code_tag.index(b'\0', 4) + 1
479         code_reader = io.BytesIO(code_tag[p:])
480
481         # Parse ABC (AVM2 ByteCode)
482         def read_int(reader=None):
483             if reader is None:
484                 reader = code_reader
485             res = 0
486             shift = 0
487             for _ in range(5):
488                 buf = reader.read(1)
489                 assert len(buf) == 1
490                 b = struct.unpack('<B', buf)[0]
491                 res = res | ((b & 0x7f) << shift)
492                 if b & 0x80 == 0:
493                     break
494                 shift += 7
495             return res
496
497         def u30(reader=None):
498             res = read_int(reader)
499             assert res & 0xf0000000 == 0
500             return res
501         u32 = read_int
502
503         def s32(reader=None):
504             v = read_int(reader)
505             if v & 0x80000000 != 0:
506                 v = - ((v ^ 0xffffffff) + 1)
507             return v
508
509         def read_string(reader=None):
510             if reader is None:
511                 reader = code_reader
512             slen = u30(reader)
513             resb = reader.read(slen)
514             assert len(resb) == slen
515             return resb.decode('utf-8')
516
517         def read_bytes(count, reader=None):
518             if reader is None:
519                 reader = code_reader
520             resb = reader.read(count)
521             assert len(resb) == count
522             return resb
523
524         def read_byte(reader=None):
525             resb = read_bytes(1, reader=reader)
526             res = struct.unpack('<B', resb)[0]
527             return res
528
529         # minor_version + major_version
530         read_bytes(2 + 2)
531
532         # Constant pool
533         int_count = u30()
534         for _c in range(1, int_count):
535             s32()
536         uint_count = u30()
537         for _c in range(1, uint_count):
538             u32()
539         double_count = u30()
540         read_bytes((double_count-1) * 8)
541         string_count = u30()
542         constant_strings = [u'']
543         for _c in range(1, string_count):
544             s = read_string()
545             constant_strings.append(s)
546         namespace_count = u30()
547         for _c in range(1, namespace_count):
548             read_bytes(1)  # kind
549             u30()  # name
550         ns_set_count = u30()
551         for _c in range(1, ns_set_count):
552             count = u30()
553             for _c2 in range(count):
554                 u30()
555         multiname_count = u30()
556         MULTINAME_SIZES = {
557             0x07: 2,  # QName
558             0x0d: 2,  # QNameA
559             0x0f: 1,  # RTQName
560             0x10: 1,  # RTQNameA
561             0x11: 0,  # RTQNameL
562             0x12: 0,  # RTQNameLA
563             0x09: 2,  # Multiname
564             0x0e: 2,  # MultinameA
565             0x1b: 1,  # MultinameL
566             0x1c: 1,  # MultinameLA
567         }
568         multinames = [u'']
569         for _c in range(1, multiname_count):
570             kind = u30()
571             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
572             if kind == 0x07:
573                 u30()  # namespace_idx
574                 name_idx = u30()
575                 multinames.append(constant_strings[name_idx])
576             else:
577                 multinames.append('[MULTINAME kind: %d]' % kind)
578                 for _c2 in range(MULTINAME_SIZES[kind]):
579                     u30()
580
581         # Methods
582         method_count = u30()
583         MethodInfo = collections.namedtuple(
584             'MethodInfo',
585             ['NEED_ARGUMENTS', 'NEED_REST'])
586         method_infos = []
587         for method_id in range(method_count):
588             param_count = u30()
589             u30()  # return type
590             for _ in range(param_count):
591                 u30()  # param type
592             u30()  # name index (always 0 for youtube)
593             flags = read_byte()
594             if flags & 0x08 != 0:
595                 # Options present
596                 option_count = u30()
597                 for c in range(option_count):
598                     u30()  # val
599                     read_bytes(1)  # kind
600             if flags & 0x80 != 0:
601                 # Param names present
602                 for _ in range(param_count):
603                     u30()  # param name
604             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
605             method_infos.append(mi)
606
607         # Metadata
608         metadata_count = u30()
609         for _c in range(metadata_count):
610             u30()  # name
611             item_count = u30()
612             for _c2 in range(item_count):
613                 u30()  # key
614                 u30()  # value
615
616         def parse_traits_info():
617             trait_name_idx = u30()
618             kind_full = read_byte()
619             kind = kind_full & 0x0f
620             attrs = kind_full >> 4
621             methods = {}
622             if kind in [0x00, 0x06]:  # Slot or Const
623                 u30()  # Slot id
624                 u30()  # type_name_idx
625                 vindex = u30()
626                 if vindex != 0:
627                     read_byte()  # vkind
628             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
629                 u30()  # disp_id
630                 method_idx = u30()
631                 methods[multinames[trait_name_idx]] = method_idx
632             elif kind == 0x04:  # Class
633                 u30()  # slot_id
634                 u30()  # classi
635             elif kind == 0x05:  # Function
636                 u30()  # slot_id
637                 function_idx = u30()
638                 methods[function_idx] = multinames[trait_name_idx]
639             else:
640                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
641
642             if attrs & 0x4 != 0:  # Metadata present
643                 metadata_count = u30()
644                 for _c3 in range(metadata_count):
645                     u30()  # metadata index
646
647             return methods
648
649         # Classes
650         TARGET_CLASSNAME = u'SignatureDecipher'
651         searched_idx = multinames.index(TARGET_CLASSNAME)
652         searched_class_id = None
653         class_count = u30()
654         for class_id in range(class_count):
655             name_idx = u30()
656             if name_idx == searched_idx:
657                 # We found the class we're looking for!
658                 searched_class_id = class_id
659             u30()  # super_name idx
660             flags = read_byte()
661             if flags & 0x08 != 0:  # Protected namespace is present
662                 u30()  # protected_ns_idx
663             intrf_count = u30()
664             for _c2 in range(intrf_count):
665                 u30()
666             u30()  # iinit
667             trait_count = u30()
668             for _c2 in range(trait_count):
669                 parse_traits_info()
670
671         if searched_class_id is None:
672             raise ExtractorError(u'Target class %r not found' %
673                                  TARGET_CLASSNAME)
674
675         method_names = {}
676         method_idxs = {}
677         for class_id in range(class_count):
678             u30()  # cinit
679             trait_count = u30()
680             for _c2 in range(trait_count):
681                 trait_methods = parse_traits_info()
682                 if class_id == searched_class_id:
683                     method_names.update(trait_methods.items())
684                     method_idxs.update(dict(
685                         (idx, name)
686                         for name, idx in trait_methods.items()))
687
688         # Scripts
689         script_count = u30()
690         for _c in range(script_count):
691             u30()  # init
692             trait_count = u30()
693             for _c2 in range(trait_count):
694                 parse_traits_info()
695
696         # Method bodies
697         method_body_count = u30()
698         Method = collections.namedtuple('Method', ['code', 'local_count'])
699         methods = {}
700         for _c in range(method_body_count):
701             method_idx = u30()
702             u30()  # max_stack
703             local_count = u30()
704             u30()  # init_scope_depth
705             u30()  # max_scope_depth
706             code_length = u30()
707             code = read_bytes(code_length)
708             if method_idx in method_idxs:
709                 m = Method(code, local_count)
710                 methods[method_idxs[method_idx]] = m
711             exception_count = u30()
712             for _c2 in range(exception_count):
713                 u30()  # from
714                 u30()  # to
715                 u30()  # target
716                 u30()  # exc_type
717                 u30()  # var_name
718             trait_count = u30()
719             for _c2 in range(trait_count):
720                 parse_traits_info()
721
722         assert p + code_reader.tell() == len(code_tag)
723         assert len(methods) == len(method_idxs)
724
725         method_pyfunctions = {}
726
727         def extract_function(func_name):
728             if func_name in method_pyfunctions:
729                 return method_pyfunctions[func_name]
730             if func_name not in methods:
731                 raise ExtractorError(u'Cannot find function %r' % func_name)
732             m = methods[func_name]
733
734             def resfunc(args):
735                 registers = ['(this)'] + list(args) + [None] * m.local_count
736                 stack = []
737                 coder = io.BytesIO(m.code)
738                 while True:
739                     opcode = struct.unpack('!B', coder.read(1))[0]
740                     if opcode == 36:  # pushbyte
741                         v = struct.unpack('!B', coder.read(1))[0]
742                         stack.append(v)
743                     elif opcode == 44:  # pushstring
744                         idx = u30(coder)
745                         stack.append(constant_strings[idx])
746                     elif opcode == 48:  # pushscope
747                         # We don't implement the scope register, so we'll just
748                         # ignore the popped value
749                         stack.pop()
750                     elif opcode == 70:  # callproperty
751                         index = u30(coder)
752                         mname = multinames[index]
753                         arg_count = u30(coder)
754                         args = list(reversed(
755                             [stack.pop() for _ in range(arg_count)]))
756                         obj = stack.pop()
757                         if mname == u'split':
758                             assert len(args) == 1
759                             assert isinstance(args[0], compat_str)
760                             assert isinstance(obj, compat_str)
761                             if args[0] == u'':
762                                 res = list(obj)
763                             else:
764                                 res = obj.split(args[0])
765                             stack.append(res)
766                         elif mname == u'slice':
767                             assert len(args) == 1
768                             assert isinstance(args[0], int)
769                             assert isinstance(obj, list)
770                             res = obj[args[0]:]
771                             stack.append(res)
772                         elif mname == u'join':
773                             assert len(args) == 1
774                             assert isinstance(args[0], compat_str)
775                             assert isinstance(obj, list)
776                             res = args[0].join(obj)
777                             stack.append(res)
778                         elif mname in method_pyfunctions:
779                             stack.append(method_pyfunctions[mname](args))
780                         else:
781                             raise NotImplementedError(
782                                 u'Unsupported property %r on %r'
783                                 % (mname, obj))
784                     elif opcode == 72:  # returnvalue
785                         res = stack.pop()
786                         return res
787                     elif opcode == 79:  # callpropvoid
788                         index = u30(coder)
789                         mname = multinames[index]
790                         arg_count = u30(coder)
791                         args = list(reversed(
792                             [stack.pop() for _ in range(arg_count)]))
793                         obj = stack.pop()
794                         if mname == u'reverse':
795                             assert isinstance(obj, list)
796                             obj.reverse()
797                         else:
798                             raise NotImplementedError(
799                                 u'Unsupported (void) property %r on %r'
800                                 % (mname, obj))
801                     elif opcode == 93:  # findpropstrict
802                         index = u30(coder)
803                         mname = multinames[index]
804                         res = extract_function(mname)
805                         stack.append(res)
806                     elif opcode == 97:  # setproperty
807                         index = u30(coder)
808                         value = stack.pop()
809                         idx = stack.pop()
810                         obj = stack.pop()
811                         assert isinstance(obj, list)
812                         assert isinstance(idx, int)
813                         obj[idx] = value
814                     elif opcode == 98:  # getlocal
815                         index = u30(coder)
816                         stack.append(registers[index])
817                     elif opcode == 99:  # setlocal
818                         index = u30(coder)
819                         value = stack.pop()
820                         registers[index] = value
821                     elif opcode == 102:  # getproperty
822                         index = u30(coder)
823                         pname = multinames[index]
824                         if pname == u'length':
825                             obj = stack.pop()
826                             assert isinstance(obj, list)
827                             stack.append(len(obj))
828                         else:  # Assume attribute access
829                             idx = stack.pop()
830                             assert isinstance(idx, int)
831                             obj = stack.pop()
832                             assert isinstance(obj, list)
833                             stack.append(obj[idx])
834                     elif opcode == 128:  # coerce
835                         u30(coder)
836                     elif opcode == 133:  # coerce_s
837                         assert isinstance(stack[-1], (type(None), compat_str))
838                     elif opcode == 164:  # modulo
839                         value2 = stack.pop()
840                         value1 = stack.pop()
841                         res = value1 % value2
842                         stack.append(res)
843                     elif opcode == 208:  # getlocal_0
844                         stack.append(registers[0])
845                     elif opcode == 209:  # getlocal_1
846                         stack.append(registers[1])
847                     elif opcode == 210:  # getlocal_2
848                         stack.append(registers[2])
849                     elif opcode == 211:  # getlocal_3
850                         stack.append(registers[3])
851                     elif opcode == 214:  # setlocal_2
852                         registers[2] = stack.pop()
853                     elif opcode == 215:  # setlocal_3
854                         registers[3] = stack.pop()
855                     else:
856                         raise NotImplementedError(
857                             u'Unsupported opcode %d' % opcode)
858
859             method_pyfunctions[func_name] = resfunc
860             return resfunc
861
862         initial_function = extract_function(u'decipher')
863         return lambda s: initial_function([s])
864
865     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
866         """Turn the encrypted s field into a working signature"""
867
868         if player_url is not None:
869             if player_url.startswith(u'//'):
870                 player_url = u'https:' + player_url
871             try:
872                 player_id = (player_url, len(s))
873                 if player_id not in self._player_cache:
874                     func = self._extract_signature_function(
875                         video_id, player_url, len(s)
876                     )
877                     self._player_cache[player_id] = func
878                 func = self._player_cache[player_id]
879                 if self._downloader.params.get('youtube_print_sig_code'):
880                     self._print_sig_code(func, len(s))
881                 return func(s)
882             except Exception:
883                 tb = traceback.format_exc()
884                 self._downloader.report_warning(
885                     u'Automatic signature extraction failed: ' + tb)
886
887             self._downloader.report_warning(
888                 u'Warning: Falling back to static signature algorithm')
889
890         return self._static_decrypt_signature(
891             s, video_id, player_url, age_gate)
892
893     def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
894         if age_gate:
895             # The videos with age protection use another player, so the
896             # algorithms can be different.
897             if len(s) == 86:
898                 return s[2:63] + s[82] + s[64:82] + s[63]
899
900         if len(s) == 93:
901             return s[86:29:-1] + s[88] + s[28:5:-1]
902         elif len(s) == 92:
903             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
904         elif len(s) == 91:
905             return s[84:27:-1] + s[86] + s[26:5:-1]
906         elif len(s) == 90:
907             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
908         elif len(s) == 89:
909             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
910         elif len(s) == 88:
911             return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
912         elif len(s) == 87:
913             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
914         elif len(s) == 86:
915             return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
916         elif len(s) == 85:
917             return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
918         elif len(s) == 84:
919             return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
920         elif len(s) == 83:
921             return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
922         elif len(s) == 82:
923             return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
924         elif len(s) == 81:
925             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
926         elif len(s) == 80:
927             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
928         elif len(s) == 79:
929             return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
930
931         else:
932             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
933
934     def _get_available_subtitles(self, video_id, webpage):
935         try:
936             sub_list = self._download_webpage(
937                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
938                 video_id, note=False)
939         except ExtractorError as err:
940             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
941             return {}
942         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
943
944         sub_lang_list = {}
945         for l in lang_list:
946             lang = l[1]
947             params = compat_urllib_parse.urlencode({
948                 'lang': lang,
949                 'v': video_id,
950                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
951                 'name': unescapeHTML(l[0]).encode('utf-8'),
952             })
953             url = u'https://www.youtube.com/api/timedtext?' + params
954             sub_lang_list[lang] = url
955         if not sub_lang_list:
956             self._downloader.report_warning(u'video doesn\'t have subtitles')
957             return {}
958         return sub_lang_list
959
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL (ttsurl) and its signed timestamp are embedded
        # in the ytplayer config JSON on the watch page.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> must be an ASR (automatic speech recognition)
            # track; otherwise the video has no automatic captions.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one caption URL per translation target, translating from
            # the ASR track's original language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1006
1007     @classmethod
1008     def extract_id(cls, url):
1009         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1010         if mobj is None:
1011             raise ExtractorError(u'Invalid URL: %s' % url)
1012         video_id = mobj.group(2)
1013         return video_id
1014
1015     def _extract_from_m3u8(self, manifest_url, video_id):
1016         url_map = {}
1017         def _get_urls(_manifest):
1018             lines = _manifest.split('\n')
1019             urls = filter(lambda l: l and not l.startswith('#'),
1020                             lines)
1021             return urls
1022         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1023         formats_urls = _get_urls(manifest)
1024         for format_url in formats_urls:
1025             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1026             url_map[itag] = format_url
1027         return url_map
1028
1029     def _extract_annotations(self, video_id):
1030         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1031         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1032
1033     def _real_extract(self, url):
1034         proto = (
1035             u'http' if self._downloader.params.get('prefer_insecure', False)
1036             else u'https')
1037
1038         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1039         mobj = re.search(self._NEXT_URL_RE, url)
1040         if mobj:
1041             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1042         video_id = self.extract_id(url)
1043
1044         # Get video webpage
1045         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1046         video_webpage = self._download_webpage(url, video_id)
1047
1048         # Attempt to extract SWF player URL
1049         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1050         if mobj is not None:
1051             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1052         else:
1053             player_url = None
1054
1055         # Get video info
1056         self.report_video_info_webpage_download(video_id)
1057         if re.search(r'player-age-gate-content">', video_webpage) is not None:
1058             self.report_age_confirmation()
1059             age_gate = True
1060             # We simulate the access to the video from www.youtube.com/v/{video_id}
1061             # this can be viewed without login into Youtube
1062             data = compat_urllib_parse.urlencode({'video_id': video_id,
1063                                                   'el': 'player_embedded',
1064                                                   'gl': 'US',
1065                                                   'hl': 'en',
1066                                                   'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1067                                                   'asv': 3,
1068                                                   'sts':'1588',
1069                                                   })
1070             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1071             video_info_webpage = self._download_webpage(video_info_url, video_id,
1072                                     note=False,
1073                                     errnote='unable to download video info webpage')
1074             video_info = compat_parse_qs(video_info_webpage)
1075         else:
1076             age_gate = False
1077             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1078                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1079                         % (video_id, el_type))
1080                 video_info_webpage = self._download_webpage(video_info_url, video_id,
1081                                         note=False,
1082                                         errnote='unable to download video info webpage')
1083                 video_info = compat_parse_qs(video_info_webpage)
1084                 if 'token' in video_info:
1085                     break
1086         if 'token' not in video_info:
1087             if 'reason' in video_info:
1088                 raise ExtractorError(
1089                     u'YouTube said: %s' % video_info['reason'][0],
1090                     expected=True, video_id=video_id)
1091             else:
1092                 raise ExtractorError(
1093                     u'"token" parameter not in video info for unknown reason',
1094                     video_id=video_id)
1095
1096         if 'view_count' in video_info:
1097             view_count = int(video_info['view_count'][0])
1098         else:
1099             view_count = None
1100
1101         # Check for "rental" videos
1102         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1103             raise ExtractorError(u'"rental" videos not supported')
1104
1105         # Start extracting information
1106         self.report_information_extraction(video_id)
1107
1108         # uploader
1109         if 'author' not in video_info:
1110             raise ExtractorError(u'Unable to extract uploader name')
1111         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1112
1113         # uploader_id
1114         video_uploader_id = None
1115         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1116         if mobj is not None:
1117             video_uploader_id = mobj.group(1)
1118         else:
1119             self._downloader.report_warning(u'unable to extract uploader nickname')
1120
1121         # title
1122         if 'title' in video_info:
1123             video_title = video_info['title'][0]
1124         else:
1125             self._downloader.report_warning(u'Unable to extract video title')
1126             video_title = u'_'
1127
1128         # thumbnail image
1129         # We try first to get a high quality image:
1130         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1131                             video_webpage, re.DOTALL)
1132         if m_thumb is not None:
1133             video_thumbnail = m_thumb.group(1)
1134         elif 'thumbnail_url' not in video_info:
1135             self._downloader.report_warning(u'unable to extract video thumbnail')
1136             video_thumbnail = None
1137         else:   # don't panic if we can't find it
1138             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1139
1140         # upload date
1141         upload_date = None
1142         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
1143         if mobj is None:
1144             mobj = re.search(
1145                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
1146                 video_webpage)
1147         if mobj is not None:
1148             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1149             upload_date = unified_strdate(upload_date)
1150
1151         m_cat_container = get_element_by_id("eow-category", video_webpage)
1152         if m_cat_container:
1153             category = self._html_search_regex(
1154                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1155                 default=None)
1156             video_categories = None if category is None else [category]
1157         else:
1158             video_categories = None
1159
1160         # description
1161         video_description = get_element_by_id("eow-description", video_webpage)
1162         if video_description:
1163             video_description = re.sub(r'''(?x)
1164                 <a\s+
1165                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1166                     title="([^"]+)"\s+
1167                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
1168                     class="yt-uix-redirect-link"\s*>
1169                 [^<]+
1170                 </a>
1171             ''', r'\1', video_description)
1172             video_description = clean_html(video_description)
1173         else:
1174             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1175             if fd_mobj:
1176                 video_description = unescapeHTML(fd_mobj.group(1))
1177             else:
1178                 video_description = u''
1179
1180         def _extract_count(klass):
1181             count = self._search_regex(
1182                 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1183                 video_webpage, klass, default=None)
1184             if count is not None:
1185                 return int(count.replace(',', ''))
1186             return None
1187         like_count = _extract_count(u'likes-count')
1188         dislike_count = _extract_count(u'dislikes-count')
1189
1190         # subtitles
1191         video_subtitles = self.extract_subtitles(video_id, video_webpage)
1192
1193         if self._downloader.params.get('listsubtitles', False):
1194             self._list_available_subtitles(video_id, video_webpage)
1195             return
1196
1197         if 'length_seconds' not in video_info:
1198             self._downloader.report_warning(u'unable to extract video duration')
1199             video_duration = None
1200         else:
1201             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1202
1203         # annotations
1204         video_annotations = None
1205         if self._downloader.params.get('writeannotations', False):
1206                 video_annotations = self._extract_annotations(video_id)
1207
1208         # Decide which formats to download
1209         try:
1210             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1211             if not mobj:
1212                 raise ValueError('Could not find vevo ID')
1213             json_code = uppercase_escape(mobj.group(1))
1214             ytplayer_config = json.loads(json_code)
1215             args = ytplayer_config['args']
1216             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1217             # this signatures are encrypted
1218             if 'url_encoded_fmt_stream_map' not in args:
1219                 raise ValueError(u'No stream_map present')  # caught below
1220             re_signature = re.compile(r'[&,]s=')
1221             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1222             if m_s is not None:
1223                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1224                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1225             m_s = re_signature.search(args.get('adaptive_fmts', u''))
1226             if m_s is not None:
1227                 if 'adaptive_fmts' in video_info:
1228                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1229                 else:
1230                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1231         except ValueError:
1232             pass
1233
1234         def _map_to_format_list(urlmap):
1235             formats = []
1236             for itag, video_real_url in urlmap.items():
1237                 dct = {
1238                     'format_id': itag,
1239                     'url': video_real_url,
1240                     'player_url': player_url,
1241                 }
1242                 if itag in self._formats:
1243                     dct.update(self._formats[itag])
1244                 formats.append(dct)
1245             return formats
1246
1247         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1248             self.report_rtmp_download()
1249             formats = [{
1250                 'format_id': '_rtmp',
1251                 'protocol': 'rtmp',
1252                 'url': video_info['conn'][0],
1253                 'player_url': player_url,
1254             }]
1255         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1256             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1257             if 'rtmpe%3Dyes' in encoded_url_map:
1258                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1259             url_map = {}
1260             for url_data_str in encoded_url_map.split(','):
1261                 url_data = compat_parse_qs(url_data_str)
1262                 if 'itag' in url_data and 'url' in url_data:
1263                     url = url_data['url'][0]
1264                     if 'sig' in url_data:
1265                         url += '&signature=' + url_data['sig'][0]
1266                     elif 's' in url_data:
1267                         encrypted_sig = url_data['s'][0]
1268                         if self._downloader.params.get('verbose'):
1269                             if age_gate:
1270                                 if player_url is None:
1271                                     player_version = 'unknown'
1272                                 else:
1273                                     player_version = self._search_regex(
1274                                         r'-(.+)\.swf$', player_url,
1275                                         u'flash player', fatal=False)
1276                                 player_desc = 'flash player %s' % player_version
1277                             else:
1278                                 player_version = self._search_regex(
1279                                     r'html5player-(.+?)\.js', video_webpage,
1280                                     'html5 player', fatal=False)
1281                                 player_desc = u'html5 player %s' % player_version
1282
1283                             parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1284                             self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1285                                 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1286
1287                         if not age_gate:
1288                             jsplayer_url_json = self._search_regex(
1289                                 r'"assets":.+?"js":\s*("[^"]+")',
1290                                 video_webpage, u'JS player URL')
1291                             player_url = json.loads(jsplayer_url_json)
1292
1293                         signature = self._decrypt_signature(
1294                             encrypted_sig, video_id, player_url, age_gate)
1295                         url += '&signature=' + signature
1296                     if 'ratebypass' not in url:
1297                         url += '&ratebypass=yes'
1298                     url_map[url_data['itag'][0]] = url
1299             formats = _map_to_format_list(url_map)
1300         elif video_info.get('hlsvp'):
1301             manifest_url = video_info['hlsvp'][0]
1302             url_map = self._extract_from_m3u8(manifest_url, video_id)
1303             formats = _map_to_format_list(url_map)
1304         else:
1305             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1306
1307         # Look for the DASH manifest
1308         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
1309             try:
1310                 # The DASH manifest used needs to be the one from the original video_webpage.
1311                 # The one found in get_video_info seems to be using different signatures.
1312                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
1313                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
1314                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
1315                 if age_gate:
1316                     dash_manifest_url = video_info.get('dashmpd')[0]
1317                 else:
1318                     dash_manifest_url = ytplayer_config['args']['dashmpd']
1319                 def decrypt_sig(mobj):
1320                     s = mobj.group(1)
1321                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1322                     return '/signature/%s' % dec_s
1323                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
1324                 dash_doc = self._download_xml(
1325                     dash_manifest_url, video_id,
1326                     note=u'Downloading DASH manifest',
1327                     errnote=u'Could not download DASH manifest')
1328                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1329                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1330                     if url_el is None:
1331                         continue
1332                     format_id = r.attrib['id']
1333                     video_url = url_el.text
1334                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1335                     f = {
1336                         'format_id': format_id,
1337                         'url': video_url,
1338                         'width': int_or_none(r.attrib.get('width')),
1339                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1340                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1341                         'filesize': filesize,
1342                     }
1343                     try:
1344                         existing_format = next(
1345                             fo for fo in formats
1346                             if fo['format_id'] == format_id)
1347                     except StopIteration:
1348                         f.update(self._formats.get(format_id, {}))
1349                         formats.append(f)
1350                     else:
1351                         existing_format.update(f)
1352
1353             except (ExtractorError, KeyError) as e:
1354                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1355
1356         self._sort_formats(formats)
1357
1358         return {
1359             'id':           video_id,
1360             'uploader':     video_uploader,
1361             'uploader_id':  video_uploader_id,
1362             'upload_date':  upload_date,
1363             'title':        video_title,
1364             'thumbnail':    video_thumbnail,
1365             'description':  video_description,
1366             'categories':   video_categories,
1367             'subtitles':    video_subtitles,
1368             'duration':     video_duration,
1369             'age_limit':    18 if age_gate else 0,
1370             'annotations':  video_annotations,
1371             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1372             'view_count':   view_count,
1373             'like_count': like_count,
1374             'dislike_count': dislike_count,
1375             'formats':      formats,
1376         }
1377
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist (also handles auto-generated mixes)."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots 
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Marker inside the "load more" widget HTML that signals another page exists
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Log in (when credentials were supplied) so private playlists work
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result delegated to the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup has changed over time; try the known class names in turn
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the AJAX "load more" widget until it disappears
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1488
1489
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a "yttoplist:{channel}:{list title}" query to the matching playlist."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Find the playlist link on the channel page by its url-encoded title
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        # NOTE(review): there is no upper bound on the retries; this relies on
        # the server eventually returning a populated page
        for i in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1520
1521
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker inside the "load more" widget HTML that signals another page exists
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in ``page``, in order of appearance."""
        ids_in_page = []
        # Use a set for the membership test: the previous list-based
        # `not in` check was O(n) per video, O(n^2) for large channels.
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of a channel and return them as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated (topic/genre) channels are rendered differently
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1576
1577
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps the result size per request, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match their
        # URLs too. NOTE: this scans module globals for every *IE class, so
        # it relies on all Youtube extractors being defined in this module.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Called lazily by PagedList; the API's start-index is
            # 1-based, hence the +1
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries means we ran past the last video
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1638
1639
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        page_size = 50
        collected_ids = []
        page_index = 0
        remaining_limit = n

        # Fetch pages until we have enough ids or the API runs out of results
        while page_size * page_index < remaining_limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                page_size * page_index + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(video['id'] for video in api_response['items'])

            # Never request beyond what the API reports to exist
            remaining_limit = min(n, api_response['totalItems'])
            page_index += 1

        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1681
1682
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that orders results by publication date."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent, plus orderby=published
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1687     IE_DESC = u'YouTube.com searches, newest videos first'
1688
1689
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape a results page and return each listed video as a playlist entry."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        def _parse_item(item_html):
            # The title sits either in a title="" attribute or in the anchor text
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            }

        entries = [
            _parse_item(item_html)
            for item_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        ]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1724
1725
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url_result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
1739
1740
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are tied to an account, so credentials are mandatory
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Builds a URL template with one remaining %s for the paging token
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # paging starts as int 0 and becomes the (string) token scraped from
        # the "load more" link on later iterations; both format fine via %s
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            # The key differs between feed variants; take whichever is present
            feed_html = info.get('feed_html') or info.get('content_html')
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # Stop when the "load more" widget no longer advertises a next page
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1785
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Subscriptions feed ("ytsubs"); uses the default system-feed endpoint."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1791
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed ("ytrec"); uses the default system-feed endpoint."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1797
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Watch-later feed ("ytwatchlater"); uses the personal-feed endpoint."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1804
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Watch-history feed ("ythistory"); uses the personal-feed endpoint."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Was a plain u'' literal: '\.' is an invalid escape sequence in a
    # non-raw string, and every sibling extractor uses r'' — declare it raw
    # (the resulting pattern string is byte-identical).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1811
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Favourite videos ("ytfav"); resolves to the user's favourites playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the matching playlist;
        # delegate the actual extraction to YoutubePlaylistIE
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1822
1823
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs missing their v= parameter and fail with a useful hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Such URLs usually mean the shell consumed the rest of the command
        # line at an unquoted '&'; always fail with an explanation
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)