_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_HTTPError,
  20     compat_http_client,
  21     compat_urllib_error,
  22     compat_urllib_parse,
  23     compat_urllib_parse_urlparse,
  24     compat_urllib_request,
  25     compat_urlparse,
  26     compat_str,
  27 )
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     unescapeHTML,
  42     url_basename,
  43     xpath_text,
  44     xpath_with_ns,
  45 )
  46
  47
  48 class InfoExtractor(object):
  49     """Information Extractor class.
  50
  51     Information extractors are the classes that, given a URL, extract
  52     information about the video (or videos) the URL refers to. This
  53     information includes the real video URL, the video title, author and
  54     others. The information is stored in a dictionary which is then
  55     passed to the YoutubeDL. The YoutubeDL processes this
  56     information possibly downloading the video to the file system, among
  57     other possible outcomes.
  58
  59     The type field determines the type of the result.
  60     By far the most common value (and the default if _type is missing) is
  61     "video", which indicates a single video.
  62
  63     For a video, the dictionaries must include the following fields:
  64
  65     id:             Video identifier.
  66     title:          Video title, unescaped.
  67
  68     Additionally, it must contain either a formats entry or a url one:
  69
  70     formats:        A list of dictionaries for each format available, ordered
  71                     from worst to best quality.
  72
  73                     Potential fields:
  74                     * url        Mandatory. The URL of the video file
  75                     * ext        Will be calculated from URL if missing
  76                     * format     A human-readable description of the format
  77                                  ("mp4 container with h264/opus").
  78                                  Calculated from the format_id, width, height.
  79                                  and format_note fields if missing.
  80                     * format_id  A short description of the format
  81                                  ("mp4_h264_opus" or "19").
  82                                 Technically optional, but strongly recommended.
  83                     * format_note Additional info about the format
  84                                  ("3D" or "DASH video")
  85                     * width      Width of the video, if known
  86                     * height     Height of the video, if known
  87                     * resolution Textual description of width and height
  88                     * tbr        Average bitrate of audio and video in KBit/s
  89                     * abr        Average audio bitrate in KBit/s
  90                     * acodec     Name of the audio codec in use
  91                     * asr        Audio sampling rate in Hertz
  92                     * vbr        Average video bitrate in KBit/s
  93                     * fps        Frame rate
  94                     * vcodec     Name of the video codec in use
  95                     * container  Name of the container format
  96                     * filesize   The number of bytes, if known in advance
  97                     * filesize_approx  An estimate for the number of bytes
  98                     * player_url SWF Player URL (used for rtmpdump).
  99                     * protocol   The protocol that will be used for the actual
 100                                  download, lower-case.
 101                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 102                                  "m3u8", or "m3u8_native".
 103                     * preference Order number of this format. If this field is
 104                                  present and not None, the formats get sorted
 105                                  by this field, regardless of all other values.
 106                                  -1 for default (order by other properties),
 107                                  -2 or smaller for less than default.
 108                                  < -1000 to hide the format (if there is
 109                                     another one which is strictly better)
 110                     * language_preference  Is this in the correct requested
 111                                  language?
 112                                  10 if it's what the URL is about,
 113                                  -1 for default (don't know),
 114                                  -10 otherwise, other values reserved for now.
 115                     * quality    Order number of the video quality of this
 116                                  format, irrespective of the file format.
 117                                  -1 for default (order by other properties),
 118                                  -2 or smaller for less than default.
 119                     * source_preference  Order number for this video source
 120                                   (quality takes higher priority)
 121                                  -1 for default (order by other properties),
 122                                  -2 or smaller for less than default.
 123                     * http_headers  A dictionary of additional HTTP headers
 124                                  to add to the request.
 125                     * stretched_ratio  If given and not 1, indicates that the
 126                                  video's pixels are not square.
 127                                  width : height ratio as float.
 128                     * no_resume  The server does not support resuming the
 129                                  (HTTP or RTMP) download. Boolean.
 130
 131     url:            Final video URL.
 132     ext:            Video filename extension.
 133     format:         The video format, defaults to ext (used for --get-format)
 134     player_url:     SWF Player URL (used for rtmpdump).
 135
 136     The following fields are optional:
 137
 138     alt_title:      A secondary title of the video.
 139     display_id      An alternative identifier for the video, not necessarily
 140                     unique, but available before title. Typically, id is
 141                     something like "4234987", title "Dancing naked mole rats",
 142                     and display_id "dancing-naked-mole-rats"
 143     thumbnails:     A list of dictionaries, with the following entries:
 144                         * "id" (optional, string) - Thumbnail format ID
 145                         * "url"
 146                         * "preference" (optional, int) - quality of the image
 147                         * "width" (optional, int)
 148                         * "height" (optional, int)
  149                         * "resolution" (optional, string "{width}x{height}",
 150                                         deprecated)
 151     thumbnail:      Full URL to a video thumbnail image.
 152     description:    Full video description.
 153     uploader:       Full name of the video uploader.
 154     creator:        The main artist who created the video.
 155     release_date:   The date (YYYYMMDD) when the video was released.
 156     timestamp:      UNIX timestamp of the moment the video became available.
 157     upload_date:    Video upload date (YYYYMMDD).
 158                     If not explicitly set, calculated from timestamp.
 159     uploader_id:    Nickname or id of the video uploader.
 160     location:       Physical location where the video was filmed.
 161     subtitles:      The available subtitles as a dictionary in the format
 162                     {language: subformats}. "subformats" is a list sorted from
 163                     lower to higher preference, each element is a dictionary
 164                     with the "ext" entry and one of:
 165                         * "data": The subtitles file contents
 166                         * "url": A URL pointing to the subtitles file
 167     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 168                     automatically generated captions
 169     duration:       Length of the video in seconds, as an integer.
 170     view_count:     How many users have watched the video on the platform.
 171     like_count:     Number of positive ratings of the video
 172     dislike_count:  Number of negative ratings of the video
  173     average_rating: Average rating given by users, the scale used depends on the webpage
 174     comment_count:  Number of comments on the video
 175     comments:       A list of comments, each with one or more of the following
 176                     properties (all but one of text or html optional):
 177                         * "author" - human-readable name of the comment author
 178                         * "author_id" - user ID of the comment author
 179                         * "id" - Comment ID
 180                         * "html" - Comment as HTML
 181                         * "text" - Plain text of the comment
 182                         * "timestamp" - UNIX timestamp of comment
 183                         * "parent" - ID of the comment this one is replying to.
 184                                      Set to "root" to indicate that this is a
 185                                      comment to the original video.
 186     age_limit:      Age restriction for the video, as an integer (years)
 187     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 188                     should allow to get the same result again. (It will be set
 189                     by YoutubeDL if it's missing)
 190     categories:     A list of categories that the video falls in, for example
 191                     ["Sports", "Berlin"]
 192     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 193     is_live:        True, False, or None (=unknown). Whether this video is a
 194                     live stream that goes on instead of a fixed-length video.
 195     start_time:     Time in seconds where the reproduction should start, as
 196                     specified in the URL.
 197     end_time:       Time in seconds where the reproduction should end, as
 198                     specified in the URL.
 199
 200     Unless mentioned otherwise, the fields should be Unicode strings.
 201
 202     Unless mentioned otherwise, None is equivalent to absence of information.
 203
 204
 205     _type "playlist" indicates multiple videos.
 206     There must be a key "entries", which is a list, an iterable, or a PagedList
 207     object, each element of which is a valid dictionary by this specification.
 208
 209     Additionally, playlists can have "title", "description" and "id" attributes
 210     with the same semantics as videos (see above).
 211
 212
 213     _type "multi_video" indicates that there are multiple videos that
  214     form a single show, for example multiple acts of an opera or TV episode.
 215     It must have an entries key like a playlist and contain all the keys
 216     required for a video at the same time.
 217
 218
 219     _type "url" indicates that the video must be extracted from another
 220     location, possibly by a different extractor. Its only required key is:
 221     "url" - the next URL to extract.
 222     The key "ie_key" can be set to the class name (minus the trailing "IE",
 223     e.g. "Youtube") if the extractor class is known in advance.
 224     Additionally, the dictionary may have any properties of the resolved entity
 225     known in advance, for example "title" if the title of the referred video is
 226     known ahead of time.
 227
 228
 229     _type "url_transparent" entities have the same specification as "url", but
 230     indicate that the given additional information is more precise than the one
 231     associated with the resolved URL.
 232     This is useful when a site employs a video service that hosts the video and
 233     its technical metadata, but that video service does not embed a useful
 234     title, description etc.
 235
 236
 237     Subclasses of this one should re-define the _real_initialize() and
 238     _real_extract() methods and define a _VALID_URL regexp.
 239     Probably, they should also be added to the list of extractors.
 240
 241     Finally, the _WORKING attribute should be set to False for broken IEs
 242     in order to warn the users and skip the tests.
 243     """
 244
 245     _ready = False
 246     _downloader = None
 247     _WORKING = True
 248
 249     def __init__(self, downloader=None):
 250         """Constructor. Receives an optional downloader."""
 251         self._ready = False
 252         self.set_downloader(downloader)
 253
 254     @classmethod
 255     def suitable(cls, url):
 256         """Receives a URL and returns True if suitable for this IE."""
 257
 258         # This does not use has/getattr intentionally - we want to know whether
 259         # we have cached the regexp for *this* class, whereas getattr would also
 260         # match the superclass
 261         if '_VALID_URL_RE' not in cls.__dict__:
 262             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 263         return cls._VALID_URL_RE.match(url) is not None
 264
 265     @classmethod
 266     def _match_id(cls, url):
 267         if '_VALID_URL_RE' not in cls.__dict__:
 268             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 269         m = cls._VALID_URL_RE.match(url)
 270         assert m
 271         return m.group('id')
 272
 273     @classmethod
 274     def working(cls):
 275         """Getter method for _WORKING."""
 276         return cls._WORKING
 277
 278     def initialize(self):
 279         """Initializes an instance (authentication, etc)."""
 280         if not self._ready:
 281             self._real_initialize()
 282             self._ready = True
 283
 284     def extract(self, url):
 285         """Extracts URL information and returns it in list of dicts."""
 286         try:
 287             self.initialize()
 288             return self._real_extract(url)
 289         except ExtractorError:
 290             raise
 291         except compat_http_client.IncompleteRead as e:
 292             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 293         except (KeyError, StopIteration) as e:
 294             raise ExtractorError('An extractor error has occured.', cause=e)
 295
 296     def set_downloader(self, downloader):
 297         """Sets the downloader for this IE."""
 298         self._downloader = downloader
 299
 300     def _real_initialize(self):
 301         """Real initialization process. Redefine in subclasses."""
 302         pass
 303
 304     def _real_extract(self, url):
 305         """Real extraction process. Redefine in subclasses."""
 306         pass
 307
 308     @classmethod
 309     def ie_key(cls):
 310         """A string for getting the InfoExtractor with get_info_extractor"""
 311         return cls.__name__[:-2]
 312
 313     @property
 314     def IE_NAME(self):
 315         return type(self).__name__[:-2]
 316
 317     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 318         """ Returns the response handle """
 319         if note is None:
 320             self.report_download_webpage(video_id)
 321         elif note is not False:
 322             if video_id is None:
 323                 self.to_screen('%s' % (note,))
 324             else:
 325                 self.to_screen('%s: %s' % (video_id, note))
 326         try:
 327             return self._downloader.urlopen(url_or_request)
 328         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 329             if errnote is False:
 330                 return False
 331             if errnote is None:
 332                 errnote = 'Unable to download webpage'
 333             errmsg = '%s: %s' % (errnote, compat_str(err))
 334             if fatal:
 335                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 336             else:
 337                 self._downloader.report_warning(errmsg)
 338                 return False
 339
 340     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 341         """ Returns a tuple (page content as string, URL handle) """
 342         # Strip hashes from the URL (#1038)
 343         if isinstance(url_or_request, (compat_str, str)):
 344             url_or_request = url_or_request.partition('#')[0]
 345
 346         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 347         if urlh is False:
 348             assert not fatal
 349             return False
 350         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 351         return (content, urlh)
 352
 353     @staticmethod
 354     def _guess_encoding_from_content(content_type, webpage_bytes):
 355         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 356         if m:
 357             encoding = m.group(1)
 358         else:
 359             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 360                           webpage_bytes[:1024])
 361             if m:
 362                 encoding = m.group(1).decode('ascii')
 363             elif webpage_bytes.startswith(b'\xff\xfe'):
 364                 encoding = 'utf-16'
 365             else:
 366                 encoding = 'utf-8'
 367
 368         return encoding
 369
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of the response handle urlh and return it decoded as text.

        prefix, if given, is prepended to the bytes read from urlh. When
        encoding is not given, it is guessed with _guess_encoding_from_content.
        Depending on the 'dump_intermediate_pages' and 'write_pages' downloader
        params, the raw bytes are also printed base64-encoded and/or written to
        a '.dump' file named after video_id and the URL.

        note, errnote and fatal are accepted but not used in this method.

        Raises an expected ExtractorError when the decoded page looks like a
        Websense block page or a "The URL you requested has been blocked" page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object (which has
            # get_full_url) or a plain URL string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the tail of an over-long
            # name is replaced by an MD5 digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An unknown encoding name raises LookupError; fall back to UTF-8.
        # Undecodable bytes are replaced rather than raising.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect known block pages and turn them into an expected error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 432
 433     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 434         """ Returns the data of the page as a string """
 435         success = False
 436         try_count = 0
 437         while success is False:
 438             try:
 439                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 440                 success = True
 441             except compat_http_client.IncompleteRead as e:
 442                 try_count += 1
 443                 if try_count >= tries:
 444                     raise e
 445                 self._sleep(timeout, video_id)
 446         if res is False:
 447             return res
 448         else:
 449             content, _ = res
 450             return content
 451
 452     def _download_xml(self, url_or_request, video_id,
 453                       note='Downloading XML', errnote='Unable to download XML',
 454                       transform_source=None, fatal=True, encoding=None):
 455         """Return the xml as an xml.etree.ElementTree.Element"""
 456         xml_string = self._download_webpage(
 457             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 458         if xml_string is False:
 459             return xml_string
 460         if transform_source:
 461             xml_string = transform_source(xml_string)
 462         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 463
 464     def _download_json(self, url_or_request, video_id,
 465                        note='Downloading JSON metadata',
 466                        errnote='Unable to download JSON metadata',
 467                        transform_source=None,
 468                        fatal=True, encoding=None):
 469         json_string = self._download_webpage(
 470             url_or_request, video_id, note, errnote, fatal=fatal,
 471             encoding=encoding)
 472         if (not fatal) and json_string is False:
 473             return None
 474         return self._parse_json(
 475             json_string, video_id, transform_source=transform_source, fatal=fatal)
 476
 477     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 478         if transform_source:
 479             json_string = transform_source(json_string)
 480         try:
 481             return json.loads(json_string)
 482         except ValueError as ve:
 483             errmsg = '%s: Failed to parse JSON ' % video_id
 484             if fatal:
 485                 raise ExtractorError(errmsg, cause=ve)
 486             else:
 487                 self.report_warning(errmsg + str(ve))
 488
 489     def report_warning(self, msg, video_id=None):
 490         idstr = '' if video_id is None else '%s: ' % video_id
 491         self._downloader.report_warning(
 492             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 493
 494     def to_screen(self, msg):
 495         """Print msg to screen, prefixing it with '[ie_name]'"""
 496         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 497
 498     def report_extraction(self, id_or_name):
 499         """Report information extraction."""
 500         self.to_screen('%s: Extracting information' % id_or_name)
 501
 502     def report_download_webpage(self, video_id):
 503         """Report webpage download."""
 504         self.to_screen('%s: Downloading webpage' % video_id)
 505
 506     def report_age_confirmation(self):
 507         """Report attempt to confirm age."""
 508         self.to_screen('Confirming age')
 509
 510     def report_login(self):
 511         """Report attempt to log in."""
 512         self.to_screen('Logging in')
 513
 514     @staticmethod
 515     def raise_login_required(msg='This video is only available for registered users'):
 516         raise ExtractorError(
 517             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 518             expected=True)
 519
 520     @staticmethod
 521     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 522         raise ExtractorError(
 523             '%s. You might want to use --proxy to workaround.' % msg,
 524             expected=True)
 525
 526     # Methods for following #608
 527     @staticmethod
 528     def url_result(url, ie=None, video_id=None, video_title=None):
 529         """Returns a URL that points to a page that should be processed"""
 530         # TODO: ie should be the class used for getting the info
 531         video_info = {'_type': 'url',
 532                       'url': url,
 533                       'ie_key': ie}
 534         if video_id is not None:
 535             video_info['id'] = video_id
 536         if video_title is not None:
 537             video_info['title'] = video_title
 538         return video_info
 539
 540     @staticmethod
 541     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 542         """Returns a playlist"""
 543         video_info = {'_type': 'playlist',
 544                       'entries': entries}
 545         if playlist_id:
 546             video_info['id'] = playlist_id
 547         if playlist_title:
 548             video_info['title'] = playlist_title
 549         if playlist_description:
 550             video_info['description'] = playlist_description
 551         return video_info
 552
 553     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 554         """
 555         Perform a regex search on the given string, using a single or a list of
 556         patterns returning the first matching group.
 557         In case of failure return a default value or raise a WARNING or a
 558         RegexNotFoundError, depending on fatal, specifying the field name.
 559         """
 560         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 561             mobj = re.search(pattern, string, flags)
 562         else:
 563             for p in pattern:
 564                 mobj = re.search(p, string, flags)
 565                 if mobj:
 566                     break
 567
 568         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 569             _name = '\033[0;34m%s\033[0m' % name
 570         else:
 571             _name = name
 572
 573         if mobj:
 574             if group is None:
 575                 # return the first matching group
 576                 return next(g for g in mobj.groups() if g is not None)
 577             else:
 578                 return mobj.group(group)
 579         elif default is not NO_DEFAULT:
 580             return default
 581         elif fatal:
 582             raise RegexNotFoundError('Unable to extract %s' % _name)
 583         else:
 584             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 585             return None
 586
 587     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 588         """
 589         Like _search_regex, but strips HTML tags and unescapes entities.
 590         """
 591         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 592         if res:
 593             return clean_html(res).strip()
 594         else:
 595             return res
 596
 597     def _get_login_info(self):
 598         """
 599         Get the login info as (username, password)
 600         It will look in the netrc file using the _NETRC_MACHINE value
 601         If there's no info available, return (None, None)
 602         """
 603         if self._downloader is None:
 604             return (None, None)
 605
 606         username = None
 607         password = None
 608         downloader_params = self._downloader.params
 609
 610         # Attempt to use provided username and password or .netrc data
 611         if downloader_params.get('username', None) is not None:
 612             username = downloader_params['username']
 613             password = downloader_params['password']
 614         elif downloader_params.get('usenetrc', False):
 615             try:
 616                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 617                 if info is not None:
 618                     username = info[0]
 619                     password = info[2]
 620                 else:
 621                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 622             except (IOError, netrc.NetrcParseError) as err:
 623                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 624
 625         return (username, password)
 626
 627     def _get_tfa_info(self, note='two-factor verification code'):
 628         """
 629         Get the two-factor authentication info
 630         TODO - asking the user will be required for sms/phone verify
 631         currently just uses the command line option
 632         If there's no info available, return None
 633         """
 634         if self._downloader is None:
 635             return None
 636         downloader_params = self._downloader.params
 637
 638         if downloader_params.get('twofactor', None) is not None:
 639             return downloader_params['twofactor']
 640
 641         return compat_getpass('Type %s and press [Return]: ' % note)
 642
 643     # Helper functions for extracting OpenGraph info
 644     @staticmethod
 645     def _og_regexes(prop):
 646         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 647         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 648         template = r'<meta[^>]+?%s[^>]+?%s'
 649         return [
 650             template % (property_re, content_re),
 651             template % (content_re, property_re),
 652         ]
 653
 654     @staticmethod
 655     def _meta_regex(prop):
 656         return r'''(?isx)<meta
 657                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 658                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 659
 660     def _og_search_property(self, prop, html, name=None, **kargs):
 661         if name is None:
 662             name = 'OpenGraph %s' % prop
 663         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 664         if escaped is None:
 665             return None
 666         return unescapeHTML(escaped)
 667
 668     def _og_search_thumbnail(self, html, **kargs):
 669         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 670
 671     def _og_search_description(self, html, **kargs):
 672         return self._og_search_property('description', html, fatal=False, **kargs)
 673
 674     def _og_search_title(self, html, **kargs):
 675         return self._og_search_property('title', html, **kargs)
 676
 677     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 678         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 679         if secure:
 680             regexes = self._og_regexes('video:secure_url') + regexes
 681         return self._html_search_regex(regexes, html, name, **kargs)
 682
 683     def _og_search_url(self, html, **kargs):
 684         return self._og_search_property('url', html, **kargs)
 685
 686     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 687         if display_name is None:
 688             display_name = name
 689         return self._html_search_regex(
 690             self._meta_regex(name),
 691             html, display_name, fatal=fatal, group='content', **kwargs)
 692
 693     def _dc_search_uploader(self, html):
 694         return self._html_search_meta('dc.creator', html, 'uploader')
 695
 696     def _rta_search(self, html):
 697         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 698         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 699                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 700                      html):
 701             return 18
 702         return 0
 703
 704     def _media_rating_search(self, html):
 705         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 706         rating = self._html_search_meta('rating', html)
 707
 708         if not rating:
 709             return None
 710
 711         RATING_TABLE = {
 712             'safe for kids': 0,
 713             'general': 8,
 714             '14 years': 14,
 715             'mature': 17,
 716             'restricted': 19,
 717         }
 718         return RATING_TABLE.get(rating.lower(), None)
 719
 720     def _family_friendly_search(self, html):
 721         # See http://schema.org/VideoObject
 722         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 723
 724         if not family_friendly:
 725             return None
 726
 727         RATING_TABLE = {
 728             '1': 0,
 729             'true': 0,
 730             '0': 18,
 731             'false': 18,
 732         }
 733         return RATING_TABLE.get(family_friendly.lower(), None)
 734
 735     def _twitter_search_player(self, html):
 736         return self._html_search_meta('twitter:player', html,
 737                                       'twitter card player')
 738
 739     @staticmethod
 740     def _hidden_inputs(html):
 741         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 742         hidden_inputs = {}
 743         for input in re.findall(r'(?i)<input([^>]+)>', html):
 744             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 745                 continue
 746             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 747             if not name:
 748                 continue
 749             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 750             if not value:
 751                 continue
 752             hidden_inputs[name.group('value')] = value.group('value')
 753         return hidden_inputs
 754
 755     def _form_hidden_inputs(self, form_id, html):
 756         form = self._search_regex(
 757             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 758             html, '%s form' % form_id, group='form')
 759         return self._hidden_inputs(form)
 760
 761     def _sort_formats(self, formats, field_preference=None):
 762         if not formats:
 763             raise ExtractorError('No video formats found')
 764
 765         def _formats_key(f):
 766             # TODO remove the following workaround
 767             from ..utils import determine_ext
 768             if not f.get('ext') and 'url' in f:
 769                 f['ext'] = determine_ext(f['url'])
 770
 771             if isinstance(field_preference, (list, tuple)):
 772                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 773
 774             preference = f.get('preference')
 775             if preference is None:
 776                 proto = f.get('protocol')
 777                 if proto is None:
 778                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 779
 780                 preference = 0 if proto in ['http', 'https'] else -0.1
 781                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 782                     preference -= 0.5
 783
 784             if f.get('vcodec') == 'none':  # audio only
 785                 if self._downloader.params.get('prefer_free_formats'):
 786                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 787                 else:
 788                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 789                 ext_preference = 0
 790                 try:
 791                     audio_ext_preference = ORDER.index(f['ext'])
 792                 except ValueError:
 793                     audio_ext_preference = -1
 794             else:
 795                 if self._downloader.params.get('prefer_free_formats'):
 796                     ORDER = ['flv', 'mp4', 'webm']
 797                 else:
 798                     ORDER = ['webm', 'flv', 'mp4']
 799                 try:
 800                     ext_preference = ORDER.index(f['ext'])
 801                 except ValueError:
 802                     ext_preference = -1
 803                 audio_ext_preference = 0
 804
 805             return (
 806                 preference,
 807                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 808                 f.get('quality') if f.get('quality') is not None else -1,
 809                 f.get('tbr') if f.get('tbr') is not None else -1,
 810                 f.get('filesize') if f.get('filesize') is not None else -1,
 811                 f.get('vbr') if f.get('vbr') is not None else -1,
 812                 f.get('height') if f.get('height') is not None else -1,
 813                 f.get('width') if f.get('width') is not None else -1,
 814                 ext_preference,
 815                 f.get('abr') if f.get('abr') is not None else -1,
 816                 audio_ext_preference,
 817                 f.get('fps') if f.get('fps') is not None else -1,
 818                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 819                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 820                 f.get('format_id') if f.get('format_id') is not None else '',
 821             )
 822         formats.sort(key=_formats_key)
 823
 824     def _check_formats(self, formats, video_id):
 825         if formats:
 826             formats[:] = filter(
 827                 lambda f: self._is_valid_url(
 828                     f['url'], video_id,
 829                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 830                 formats)
 831
 832     def _is_valid_url(self, url, video_id, item='video'):
 833         url = self._proto_relative_url(url, scheme='http:')
 834         # For now assume non HTTP(S) URLs always valid
 835         if not (url.startswith('http://') or url.startswith('https://')):
 836             return True
 837         try:
 838             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 839             return True
 840         except ExtractorError as e:
 841             if isinstance(e.cause, compat_HTTPError):
 842                 self.to_screen(
 843                     '%s: %s URL is invalid, skipping' % (video_id, item))
 844                 return False
 845             raise
 846
 847     def http_scheme(self):
 848         """ Either "http:" or "https:", depending on the user's preferences """
 849         return (
 850             'http:'
 851             if self._downloader.params.get('prefer_insecure', False)
 852             else 'https:')
 853
 854     def _proto_relative_url(self, url, scheme=None):
 855         if url is None:
 856             return url
 857         if url.startswith('//'):
 858             if scheme is None:
 859                 scheme = self.http_scheme()
 860             return scheme + url
 861         else:
 862             return url
 863
 864     def _sleep(self, timeout, video_id, msg_template=None):
 865         if msg_template is None:
 866             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 867         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 868         self.to_screen(msg)
 869         time.sleep(timeout)
 870
 871     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 872                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 873         manifest = self._download_xml(
 874             manifest_url, video_id, 'Downloading f4m manifest',
 875             'Unable to download f4m manifest',
 876             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 877             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 878             transform_source=transform_source)
 879
 880         formats = []
 881         manifest_version = '1.0'
 882         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 883         if not media_nodes:
 884             manifest_version = '2.0'
 885             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 886         for i, media_el in enumerate(media_nodes):
 887             if manifest_version == '2.0':
 888                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 889                 if not media_url:
 890                     continue
 891                 manifest_url = (
 892                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 893                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 894                 # If media_url is itself a f4m manifest do the recursive extraction
 895                 # since bitrates in parent manifest (this one) and media_url manifest
 896                 # may differ leading to inability to resolve the format by requested
 897                 # bitrate in f4m downloader
 898                 if determine_ext(manifest_url) == 'f4m':
 899                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 900                     continue
 901             tbr = int_or_none(media_el.attrib.get('bitrate'))
 902             formats.append({
 903                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 904                 'url': manifest_url,
 905                 'ext': 'flv',
 906                 'tbr': tbr,
 907                 'width': int_or_none(media_el.attrib.get('width')),
 908                 'height': int_or_none(media_el.attrib.get('height')),
 909                 'preference': preference,
 910             })
 911         self._sort_formats(formats)
 912
 913         return formats
 914
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """Download an m3u8 playlist and build format dicts from it.

        m3u8_url       -- URL of the playlist; also the base for relative
                          entry URLs
        video_id       -- identifier passed on to the download helper
        ext            -- value stored as 'ext' in the formats
        entry_protocol -- value stored as 'protocol' in the formats built
                          from playlist entries
        preference     -- value stored as 'preference' in those formats
        m3u8_id        -- optional prefix of the generated format ids
        note, errnote  -- messages for the download helper (defaults are
                          used when empty)
        fatal          -- passed on to the download helper

        The result always starts out with a 'meta' format pointing at the
        playlist itself, followed by one format per URI line, and is
        sorted with _sort_formats before being returned. If the download
        helper returns False (presumably a failed non-fatal download),
        that value is returned instead of a list.
        """

        # Format for the playlist itself; it ranks one step below
        # `preference` (or at -1 when no preference is given)
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Absolute http(s) URLs are kept, anything else is joined to the
        # playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            return m3u8_doc
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA
        # tags; they describe the URI line that follows them
        last_info = None
        last_media = None
        # KEY=value pairs separated by commas; values may be double quoted
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        # Drop the surrounding quotes
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        # Drop the surrounding quotes
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines carry no format
                continue
            else:
                # A URI line
                if last_info is None:
                    # No #EXT-X-STREAM-INF tag has been seen so far: only
                    # the URL is known
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # The NAME of a preceding non-subtitles #EXT-X-MEDIA tag
                # wins; otherwise the bitrate, or the number of formats
                # collected so far, is used
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        # Keep only the part before the first dot
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    # Expected as <width>x<height>
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    # The media attributes are attached to one format only
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # Reset to an empty dict (not None): later URI lines are
                # still handled by this branch, just without attributes
                last_info = {}
        self._sort_formats(formats)
        return formats
1001
1002     @staticmethod
1003     def _xpath_ns(path, namespace=None):
1004         if not namespace:
1005             return path
1006         out = []
1007         for c in path.split('/'):
1008             if not c or c == '.':
1009                 out.append(c)
1010             else:
1011                 out.append('{%s}%s' % (namespace, c))
1012         return '/'.join(out)
1013
1014     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1015         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1016
1017         if smil is False:
1018             assert not fatal
1019             return []
1020
1021         namespace = self._parse_smil_namespace(smil)
1022
1023         return self._parse_smil_formats(
1024             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1025
1026     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1027         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1028         if smil is False:
1029             return {}
1030         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1031
1032     def _download_smil(self, smil_url, video_id, fatal=True):
1033         return self._download_xml(
1034             smil_url, video_id, 'Downloading SMIL file',
1035             'Unable to download SMIL file', fatal=fatal)
1036
1037     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1038         namespace = self._parse_smil_namespace(smil)
1039
1040         formats = self._parse_smil_formats(
1041             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1042         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1043
1044         video_id = os.path.splitext(url_basename(smil_url))[0]
1045         title = None
1046         description = None
1047         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1048             name = meta.attrib.get('name')
1049             content = meta.attrib.get('content')
1050             if not name or not content:
1051                 continue
1052             if not title and name == 'title':
1053                 title = content
1054             elif not description and name in ('description', 'abstract'):
1055                 description = content
1056
1057         return {
1058             'id': video_id,
1059             'title': title or video_id,
1060             'description': description,
1061             'formats': formats,
1062             'subtitles': subtitles,
1063         }
1064
1065     def _parse_smil_namespace(self, smil):
1066         return self._search_regex(
1067             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1068
1069     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1070         base = smil_url
1071         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1072             b = meta.get('base') or meta.get('httpBase')
1073             if b:
1074                 base = b
1075                 break
1076
1077         formats = []
1078         rtmp_count = 0
1079         http_count = 0
1080
1081         videos = smil.findall(self._xpath_ns('.//video', namespace))
1082         for video in videos:
1083             src = video.get('src')
1084             if not src:
1085                 continue
1086
1087             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1088             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1089             width = int_or_none(video.get('width'))
1090             height = int_or_none(video.get('height'))
1091             proto = video.get('proto')
1092             ext = video.get('ext')
1093             src_ext = determine_ext(src)
1094             streamer = video.get('streamer') or base
1095
1096             if proto == 'rtmp' or streamer.startswith('rtmp'):
1097                 rtmp_count += 1
1098                 formats.append({
1099                     'url': streamer,
1100                     'play_path': src,
1101                     'ext': 'flv',
1102                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1103                     'tbr': bitrate,
1104                     'filesize': filesize,
1105                     'width': width,
1106                     'height': height,
1107                 })
1108                 if transform_rtmp_url:
1109                     streamer, src = transform_rtmp_url(streamer, src)
1110                     formats[-1].update({
1111                         'url': streamer,
1112                         'play_path': src,
1113                     })
1114                 continue
1115
1116             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1117
1118             if proto == 'm3u8' or src_ext == 'm3u8':
1119                 formats.extend(self._extract_m3u8_formats(
1120                     src_url, video_id, ext or 'mp4', m3u8_id='hls'))
1121                 continue
1122
1123             if src_ext == 'f4m':
1124                 f4m_url = src_url
1125                 if not f4m_params:
1126                     f4m_params = {
1127                         'hdcore': '3.2.0',
1128                         'plugin': 'flowplayer-3.2.0.1',
1129                     }
1130                 f4m_url += '&' if '?' in f4m_url else '?'
1131                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1132                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
1133                 continue
1134
1135             if src_url.startswith('http'):
1136                 http_count += 1
1137                 formats.append({
1138                     'url': src_url,
1139                     'ext': ext or src_ext or 'flv',
1140                     'format_id': 'http-%d' % (bitrate or http_count),
1141                     'tbr': bitrate,
1142                     'filesize': filesize,
1143                     'width': width,
1144                     'height': height,
1145                 })
1146                 continue
1147
1148         self._sort_formats(formats)
1149
1150         return formats
1151
1152     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1153         subtitles = {}
1154         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1155             src = textstream.get('src')
1156             if not src:
1157                 continue
1158             ext = textstream.get('ext') or determine_ext(src)
1159             if not ext:
1160                 type_ = textstream.get('type')
1161                 SUBTITLES_TYPES = {
1162                     'text/vtt': 'vtt',
1163                     'text/srt': 'srt',
1164                     'application/smptett+xml': 'tt',
1165                 }
1166                 if type_ in SUBTITLES_TYPES:
1167                     ext = SUBTITLES_TYPES[type_]
1168             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1169             subtitles.setdefault(lang, []).append({
1170                 'url': src,
1171                 'ext': ext,
1172             })
1173         return subtitles
1174
1175     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1176         xspf = self._download_xml(
1177             playlist_url, playlist_id, 'Downloading xpsf playlist',
1178             'Unable to download xspf manifest', fatal=fatal)
1179         if xspf is False:
1180             return []
1181         return self._parse_xspf(xspf, playlist_id)
1182
1183     def _parse_xspf(self, playlist, playlist_id):
1184         NS_MAP = {
1185             'xspf': 'http://xspf.org/ns/0/',
1186             's1': 'http://static.streamone.nl/player/ns/0',
1187         }
1188
1189         entries = []
1190         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1191             title = xpath_text(
1192                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1193             description = xpath_text(
1194                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1195             thumbnail = xpath_text(
1196                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1197             duration = float_or_none(
1198                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1199
1200             formats = [{
1201                 'url': location.text,
1202                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1203                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1204                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1205             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1206             self._sort_formats(formats)
1207
1208             entries.append({
1209                 'id': playlist_id,
1210                 'title': title,
1211                 'description': description,
1212                 'thumbnail': thumbnail,
1213                 'duration': duration,
1214                 'formats': formats,
1215             })
1216         return entries
1217
1218     def _live_title(self, name):
1219         """ Generate the title for a live video """
1220         now = datetime.datetime.now()
1221         now_str = now.strftime("%Y-%m-%d %H:%M")
1222         return name + ' ' + now_str
1223
1224     def _int(self, v, name, fatal=False, **kwargs):
1225         res = int_or_none(v, **kwargs)
1226         if 'get_attr' in kwargs:
1227             print(getattr(v, kwargs['get_attr']))
1228         if res is None:
1229             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1230             if fatal:
1231                 raise ExtractorError(msg)
1232             else:
1233                 self._downloader.report_warning(msg)
1234         return res
1235
1236     def _float(self, v, name, fatal=False, **kwargs):
1237         res = float_or_none(v, **kwargs)
1238         if res is None:
1239             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1240             if fatal:
1241                 raise ExtractorError(msg)
1242             else:
1243                 self._downloader.report_warning(msg)
1244         return res
1245
1246     def _set_cookie(self, domain, name, value, expire_time=None):
1247         cookie = compat_cookiejar.Cookie(
1248             0, name, value, None, None, domain, None,
1249             None, '/', True, False, expire_time, '', None, None, None)
1250         self._downloader.cookiejar.set_cookie(cookie)
1251
1252     def _get_cookies(self, url):
1253         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1254         req = compat_urllib_request.Request(url)
1255         self._downloader.cookiejar.add_cookie_header(req)
1256         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1257
1258     def get_testcases(self, include_onlymatching=False):
1259         t = getattr(self, '_TEST', None)
1260         if t:
1261             assert not hasattr(self, '_TESTS'), \
1262                 '%s has _TEST and _TESTS' % type(self).__name__
1263             tests = [t]
1264         else:
1265             tests = getattr(self, '_TESTS', [])
1266         for t in tests:
1267             if not include_onlymatching and t.get('only_matching', False):
1268                 continue
1269             t['name'] = type(self).__name__[:-len('IE')]
1270             yield t
1271
1272     def is_suitable(self, age_limit):
1273         """ Test whether the extractor is generally suitable for the given
1274         age limit (i.e. pornographic sites are not, all others usually are) """
1275
1276         any_restricted = False
1277         for tc in self.get_testcases(include_onlymatching=False):
1278             if 'playlist' in tc:
1279                 tc = tc['playlist'][0]
1280             is_restricted = age_restricted(
1281                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1282             if not is_restricted:
1283                 return True
1284             any_restricted = any_restricted or is_restricted
1285         return not any_restricted
1286
1287     def extract_subtitles(self, *args, **kwargs):
1288         if (self._downloader.params.get('writesubtitles', False) or
1289                 self._downloader.params.get('listsubtitles')):
1290             return self._get_subtitles(*args, **kwargs)
1291         return {}
1292
1293     def _get_subtitles(self, *args, **kwargs):
1294         raise NotImplementedError("This method must be implemented by subclasses")
1295
1296     @staticmethod
1297     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1298         """ Merge subtitle items for one language. Items with duplicated URLs
1299         will be dropped. """
1300         list1_urls = set([item['url'] for item in subtitle_list1])
1301         ret = list(subtitle_list1)
1302         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1303         return ret
1304
1305     @classmethod
1306     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1307         """ Merge two subtitle dictionaries, language by language. """
1308         ret = dict(subtitle_dict1)
1309         for lang in subtitle_dict2:
1310             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1311         return ret
1312
1313     def extract_automatic_captions(self, *args, **kwargs):
1314         if (self._downloader.params.get('writeautomaticsub', False) or
1315                 self._downloader.params.get('listsubtitles')):
1316             return self._get_automatic_captions(*args, **kwargs)
1317         return {}
1318
1319     def _get_automatic_captions(self, *args, **kwargs):
1320         raise NotImplementedError("This method must be implemented by subclasses")
1321
1322
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    where the part before the colon tells how many results are wanted:
    nothing for a single result, a positive number, or 'all'.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching the search URLs of this extractor."""
        query_re = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_re % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether `url` is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Run the search described by `query`.

        The requested number of results is capped at _MAX_RESULTS (with a
        warning). Raises ExtractorError for a malformed query.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (URL prefix) of this extractor."""
        return self._SEARCH_KEY