_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import hashlib
   5 import json
   6 import netrc
   7 import os
   8 import re
   9 import socket
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..utils import (
  15     compat_http_client,
  16     compat_urllib_error,
  17     compat_urllib_parse_urlparse,
  18     compat_str,
  19
  20     clean_html,
  21     compiled_regex_type,
  22     ExtractorError,
  23     int_or_none,
  24     RegexNotFoundError,
  25     sanitize_filename,
  26     unescapeHTML,
  27 )
  28 _NO_DEFAULT = object()
  29
  30
  31 class InfoExtractor(object):
  32     """Information Extractor class.
  33
  34     Information extractors are the classes that, given a URL, extract
  35     information about the video (or videos) the URL refers to. This
  36     information includes the real video URL, the video title, author and
  37     others. The information is stored in a dictionary which is then
  38     passed to the FileDownloader. The FileDownloader processes this
  39     information possibly downloading the video to the file system, among
  40     other possible outcomes.
  41
  42     The dictionaries must include the following fields:
  43
  44     id:             Video identifier.
  45     title:          Video title, unescaped.
  46
  47     Additionally, it must contain either a formats entry or a url one:
  48
  49     formats:        A list of dictionaries for each format available, ordered
  50                     from worst to best quality.
  51
  52                     Potential fields:
  53                     * url        Mandatory. The URL of the video file
  54                     * ext        Will be calculated from url if missing
  55                     * format     A human-readable description of the format
  56                                  ("mp4 container with h264/opus").
  57                                  Calculated from the format_id, width, height.
  58                                  and format_note fields if missing.
  59                     * format_id  A short description of the format
  60                                  ("mp4_h264_opus" or "19").
  61                                 Technically optional, but strongly recommended.
  62                     * format_note Additional info about the format
  63                                  ("3D" or "DASH video")
  64                     * width      Width of the video, if known
  65                     * height     Height of the video, if known
  66                     * resolution Textual description of width and height
  67                     * tbr        Average bitrate of audio and video in KBit/s
  68                     * abr        Average audio bitrate in KBit/s
  69                     * acodec     Name of the audio codec in use
  70                     * asr        Audio sampling rate in Hertz
  71                     * vbr        Average video bitrate in KBit/s
  72                     * vcodec     Name of the video codec in use
  73                     * container  Name of the container format
  74                     * filesize   The number of bytes, if known in advance
  75                     * filesize_approx  An estimate for the number of bytes
  76                     * player_url SWF Player URL (used for rtmpdump).
  77                     * protocol   The protocol that will be used for the actual
  78                                  download, lower-case.
  79                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  80                     * preference Order number of this format. If this field is
  81                                  present and not None, the formats get sorted
  82                                  by this field, regardless of all other values.
  83                                  -1 for default (order by other properties),
  84                                  -2 or smaller for less than default.
  85                     * quality    Order number of the video quality of this
  86                                  format, irrespective of the file format.
  87                                  -1 for default (order by other properties),
  88                                  -2 or smaller for less than default.
  89                     * http_referer  HTTP Referer header value to set.
  90                     * http_method  HTTP method to use for the download.
  91                     * http_headers  A dictionary of additional HTTP headers
  92                                  to add to the request.
  93                     * http_post_data  Additional data to send with a POST
  94                                  request.
  95     url:            Final video URL.
  96     ext:            Video filename extension.
  97     format:         The video format, defaults to ext (used for --get-format)
  98     player_url:     SWF Player URL (used for rtmpdump).
  99
 100     The following fields are optional:
 101
 102     display_id      An alternative identifier for the video, not necessarily
 103                     unique, but available before title. Typically, id is
 104                     something like "4234987", title "Dancing naked mole rats",
 105                     and display_id "dancing-naked-mole-rats"
 106     thumbnails:     A list of dictionaries, with the following entries:
 107                         * "url"
 108                         * "width" (optional, int)
 109                         * "height" (optional, int)
 110                         * "resolution" (optional, string "{width}x{height"},
 111                                         deprecated)
 112     thumbnail:      Full URL to a video thumbnail image.
 113     description:    One-line video description.
 114     uploader:       Full name of the video uploader.
 115     timestamp:      UNIX timestamp of the moment the video became available.
 116     upload_date:    Video upload date (YYYYMMDD).
 117                     If not explicitly set, calculated from timestamp.
 118     uploader_id:    Nickname or id of the video uploader.
 119     location:       Physical location where the video was filmed.
 120     subtitles:      The subtitle file contents as a dictionary in the format
 121                     {language: subtitles}.
 122     duration:       Length of the video in seconds, as an integer.
 123     view_count:     How many users have watched the video on the platform.
 124     like_count:     Number of positive ratings of the video
 125     dislike_count:  Number of negative ratings of the video
 126     comment_count:  Number of comments on the video
 127     age_limit:      Age restriction for the video, as an integer (years)
 128     webpage_url:    The url to the video webpage, if given to youtube-dl it
 129                     should allow to get the same result again. (It will be set
 130                     by YoutubeDL if it's missing)
 131     categories:     A list of categories that the video falls in, for example
 132                     ["Sports", "Berlin"]
 133
 134     Unless mentioned otherwise, the fields should be Unicode strings.
 135
 136     Subclasses of this one should re-define the _real_initialize() and
 137     _real_extract() methods and define a _VALID_URL regexp.
 138     Probably, they should also be added to the list of extractors.
 139
 140     Finally, the _WORKING attribute should be set to False for broken IEs
 141     in order to warn the users and skip the tests.
 142     """
 143
 144     _ready = False
 145     _downloader = None
 146     _WORKING = True
 147
 148     def __init__(self, downloader=None):
 149         """Constructor. Receives an optional downloader."""
 150         self._ready = False
 151         self.set_downloader(downloader)
 152
 153     @classmethod
 154     def suitable(cls, url):
 155         """Receives a URL and returns True if suitable for this IE."""
 156
 157         # This does not use has/getattr intentionally - we want to know whether
 158         # we have cached the regexp for *this* class, whereas getattr would also
 159         # match the superclass
 160         if '_VALID_URL_RE' not in cls.__dict__:
 161             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 162         return cls._VALID_URL_RE.match(url) is not None
 163
 164     @classmethod
 165     def working(cls):
 166         """Getter method for _WORKING."""
 167         return cls._WORKING
 168
 169     def initialize(self):
 170         """Initializes an instance (authentication, etc)."""
 171         if not self._ready:
 172             self._real_initialize()
 173             self._ready = True
 174
 175     def extract(self, url):
 176         """Extracts URL information and returns it in list of dicts."""
 177         self.initialize()
 178         return self._real_extract(url)
 179
 180     def set_downloader(self, downloader):
 181         """Sets the downloader for this IE."""
 182         self._downloader = downloader
 183
 184     def _real_initialize(self):
 185         """Real initialization process. Redefine in subclasses."""
 186         pass
 187
 188     def _real_extract(self, url):
 189         """Real extraction process. Redefine in subclasses."""
 190         pass
 191
 192     @classmethod
 193     def ie_key(cls):
 194         """A string for getting the InfoExtractor with get_info_extractor"""
 195         return cls.__name__[:-2]
 196
 197     @property
 198     def IE_NAME(self):
 199         return type(self).__name__[:-2]
 200
 201     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 202         """ Returns the response handle """
 203         if note is None:
 204             self.report_download_webpage(video_id)
 205         elif note is not False:
 206             if video_id is None:
 207                 self.to_screen('%s' % (note,))
 208             else:
 209                 self.to_screen('%s: %s' % (video_id, note))
 210         try:
 211             return self._downloader.urlopen(url_or_request)
 212         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 213             if errnote is False:
 214                 return False
 215             if errnote is None:
 216                 errnote = 'Unable to download webpage'
 217             errmsg = '%s: %s' % (errnote, compat_str(err))
 218             if fatal:
 219                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 220             else:
 221                 self._downloader.report_warning(errmsg)
 222                 return False
 223
 224     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 225         """ Returns a tuple (page content as string, URL handle) """
 226
 227         # Strip hashes from the URL (#1038)
 228         if isinstance(url_or_request, (compat_str, str)):
 229             url_or_request = url_or_request.partition('#')[0]
 230
 231         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 232         if urlh is False:
 233             assert not fatal
 234             return False
 235         content_type = urlh.headers.get('Content-Type', '')
 236         webpage_bytes = urlh.read()
 237         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 238         if m:
 239             encoding = m.group(1)
 240         else:
 241             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 242                           webpage_bytes[:1024])
 243             if m:
 244                 encoding = m.group(1).decode('ascii')
 245             elif webpage_bytes.startswith(b'\xff\xfe'):
 246                 encoding = 'utf-16'
 247             else:
 248                 encoding = 'utf-8'
 249         if self._downloader.params.get('dump_intermediate_pages', False):
 250             try:
 251                 url = url_or_request.get_full_url()
 252             except AttributeError:
 253                 url = url_or_request
 254             self.to_screen('Dumping request to ' + url)
 255             dump = base64.b64encode(webpage_bytes).decode('ascii')
 256             self._downloader.to_screen(dump)
 257         if self._downloader.params.get('write_pages', False):
 258             try:
 259                 url = url_or_request.get_full_url()
 260             except AttributeError:
 261                 url = url_or_request
 262             basen = '%s_%s' % (video_id, url)
 263             if len(basen) > 240:
 264                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 265                 basen = basen[:240 - len(h)] + h
 266             raw_filename = basen + '.dump'
 267             filename = sanitize_filename(raw_filename, restricted=True)
 268             self.to_screen('Saving request to ' + filename)
 269             with open(filename, 'wb') as outf:
 270                 outf.write(webpage_bytes)
 271
 272         try:
 273             content = webpage_bytes.decode(encoding, 'replace')
 274         except LookupError:
 275             content = webpage_bytes.decode('utf-8', 'replace')
 276
 277         if ('<title>Access to this site is blocked</title>' in content and
 278                 'Websense' in content[:512]):
 279             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 280             blocked_iframe = self._html_search_regex(
 281                 r'<iframe src="([^"]+)"', content,
 282                 'Websense information URL', default=None)
 283             if blocked_iframe:
 284                 msg += ' Visit %s for more details' % blocked_iframe
 285             raise ExtractorError(msg, expected=True)
 286
 287         return (content, urlh)
 288
 289     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 290         """ Returns the data of the page as a string """
 291         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 292         if res is False:
 293             return res
 294         else:
 295             content, _ = res
 296             return content
 297
 298     def _download_xml(self, url_or_request, video_id,
 299                       note='Downloading XML', errnote='Unable to download XML',
 300                       transform_source=None, fatal=True):
 301         """Return the xml as an xml.etree.ElementTree.Element"""
 302         xml_string = self._download_webpage(
 303             url_or_request, video_id, note, errnote, fatal=fatal)
 304         if xml_string is False:
 305             return xml_string
 306         if transform_source:
 307             xml_string = transform_source(xml_string)
 308         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 309
 310     def _download_json(self, url_or_request, video_id,
 311                        note='Downloading JSON metadata',
 312                        errnote='Unable to download JSON metadata',
 313                        transform_source=None,
 314                        fatal=True):
 315         json_string = self._download_webpage(
 316             url_or_request, video_id, note, errnote, fatal=fatal)
 317         if (not fatal) and json_string is False:
 318             return None
 319         if transform_source:
 320             json_string = transform_source(json_string)
 321         try:
 322             return json.loads(json_string)
 323         except ValueError as ve:
 324             raise ExtractorError('Failed to download JSON', cause=ve)
 325
 326     def report_warning(self, msg, video_id=None):
 327         idstr = '' if video_id is None else '%s: ' % video_id
 328         self._downloader.report_warning(
 329             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 330
 331     def to_screen(self, msg):
 332         """Print msg to screen, prefixing it with '[ie_name]'"""
 333         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 334
 335     def report_extraction(self, id_or_name):
 336         """Report information extraction."""
 337         self.to_screen('%s: Extracting information' % id_or_name)
 338
 339     def report_download_webpage(self, video_id):
 340         """Report webpage download."""
 341         self.to_screen('%s: Downloading webpage' % video_id)
 342
 343     def report_age_confirmation(self):
 344         """Report attempt to confirm age."""
 345         self.to_screen('Confirming age')
 346
 347     def report_login(self):
 348         """Report attempt to log in."""
 349         self.to_screen('Logging in')
 350
 351     #Methods for following #608
 352     @staticmethod
 353     def url_result(url, ie=None, video_id=None):
 354         """Returns a url that points to a page that should be processed"""
 355         #TODO: ie should be the class used for getting the info
 356         video_info = {'_type': 'url',
 357                       'url': url,
 358                       'ie_key': ie}
 359         if video_id is not None:
 360             video_info['id'] = video_id
 361         return video_info
 362     @staticmethod
 363     def playlist_result(entries, playlist_id=None, playlist_title=None):
 364         """Returns a playlist"""
 365         video_info = {'_type': 'playlist',
 366                       'entries': entries}
 367         if playlist_id:
 368             video_info['id'] = playlist_id
 369         if playlist_title:
 370             video_info['title'] = playlist_title
 371         return video_info
 372
 373     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 374         """
 375         Perform a regex search on the given string, using a single or a list of
 376         patterns returning the first matching group.
 377         In case of failure return a default value or raise a WARNING or a
 378         RegexNotFoundError, depending on fatal, specifying the field name.
 379         """
 380         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 381             mobj = re.search(pattern, string, flags)
 382         else:
 383             for p in pattern:
 384                 mobj = re.search(p, string, flags)
 385                 if mobj:
 386                     break
 387
 388         if os.name != 'nt' and sys.stderr.isatty():
 389             _name = '\033[0;34m%s\033[0m' % name
 390         else:
 391             _name = name
 392
 393         if mobj:
 394             # return the first matching group
 395             return next(g for g in mobj.groups() if g is not None)
 396         elif default is not _NO_DEFAULT:
 397             return default
 398         elif fatal:
 399             raise RegexNotFoundError('Unable to extract %s' % _name)
 400         else:
 401             self._downloader.report_warning('unable to extract %s; '
 402                 'please report this issue on http://yt-dl.org/bug' % _name)
 403             return None
 404
 405     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 406         """
 407         Like _search_regex, but strips HTML tags and unescapes entities.
 408         """
 409         res = self._search_regex(pattern, string, name, default, fatal, flags)
 410         if res:
 411             return clean_html(res).strip()
 412         else:
 413             return res
 414
 415     def _get_login_info(self):
 416         """
 417         Get the the login info as (username, password)
 418         It will look in the netrc file using the _NETRC_MACHINE value
 419         If there's no info available, return (None, None)
 420         """
 421         if self._downloader is None:
 422             return (None, None)
 423
 424         username = None
 425         password = None
 426         downloader_params = self._downloader.params
 427
 428         # Attempt to use provided username and password or .netrc data
 429         if downloader_params.get('username', None) is not None:
 430             username = downloader_params['username']
 431             password = downloader_params['password']
 432         elif downloader_params.get('usenetrc', False):
 433             try:
 434                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 435                 if info is not None:
 436                     username = info[0]
 437                     password = info[2]
 438                 else:
 439                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 440             except (IOError, netrc.NetrcParseError) as err:
 441                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 442
 443         return (username, password)
 444
 445     def _get_tfa_info(self):
 446         """
 447         Get the two-factor authentication info
 448         TODO - asking the user will be required for sms/phone verify
 449         currently just uses the command line option
 450         If there's no info available, return None
 451         """
 452         if self._downloader is None:
 453             return None
 454         downloader_params = self._downloader.params
 455
 456         if downloader_params.get('twofactor', None) is not None:
 457             return downloader_params['twofactor']
 458
 459         return None
 460
 461     # Helper functions for extracting OpenGraph info
 462     @staticmethod
 463     def _og_regexes(prop):
 464         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 465         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 466         template = r'<meta[^>]+?%s[^>]+?%s'
 467         return [
 468             template % (property_re, content_re),
 469             template % (content_re, property_re),
 470         ]
 471
 472     def _og_search_property(self, prop, html, name=None, **kargs):
 473         if name is None:
 474             name = 'OpenGraph %s' % prop
 475         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 476         if escaped is None:
 477             return None
 478         return unescapeHTML(escaped)
 479
 480     def _og_search_thumbnail(self, html, **kargs):
 481         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 482
 483     def _og_search_description(self, html, **kargs):
 484         return self._og_search_property('description', html, fatal=False, **kargs)
 485
 486     def _og_search_title(self, html, **kargs):
 487         return self._og_search_property('title', html, **kargs)
 488
 489     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 490         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 491         if secure:
 492             regexes = self._og_regexes('video:secure_url') + regexes
 493         return self._html_search_regex(regexes, html, name, **kargs)
 494
 495     def _og_search_url(self, html, **kargs):
 496         return self._og_search_property('url', html, **kargs)
 497
 498     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 499         if display_name is None:
 500             display_name = name
 501         return self._html_search_regex(
 502             r'''(?ix)<meta
 503                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 504                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 505             html, display_name, fatal=fatal, **kwargs)
 506
 507     def _dc_search_uploader(self, html):
 508         return self._html_search_meta('dc.creator', html, 'uploader')
 509
 510     def _rta_search(self, html):
 511         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 512         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 513                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 514                      html):
 515             return 18
 516         return 0
 517
 518     def _media_rating_search(self, html):
 519         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 520         rating = self._html_search_meta('rating', html)
 521
 522         if not rating:
 523             return None
 524
 525         RATING_TABLE = {
 526             'safe for kids': 0,
 527             'general': 8,
 528             '14 years': 14,
 529             'mature': 17,
 530             'restricted': 19,
 531         }
 532         return RATING_TABLE.get(rating.lower(), None)
 533
 534     def _twitter_search_player(self, html):
 535         return self._html_search_meta('twitter:player', html,
 536             'twitter card player')
 537
 538     def _sort_formats(self, formats):
 539         if not formats:
 540             raise ExtractorError('No video formats found')
 541
 542         def _formats_key(f):
 543             # TODO remove the following workaround
 544             from ..utils import determine_ext
 545             if not f.get('ext') and 'url' in f:
 546                 f['ext'] = determine_ext(f['url'])
 547
 548             preference = f.get('preference')
 549             if preference is None:
 550                 proto = f.get('protocol')
 551                 if proto is None:
 552                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 553
 554                 preference = 0 if proto in ['http', 'https'] else -0.1
 555                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 556                     preference -= 0.5
 557
 558             if f.get('vcodec') == 'none':  # audio only
 559                 if self._downloader.params.get('prefer_free_formats'):
 560                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 561                 else:
 562                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 563                 ext_preference = 0
 564                 try:
 565                     audio_ext_preference = ORDER.index(f['ext'])
 566                 except ValueError:
 567                     audio_ext_preference = -1
 568             else:
 569                 if self._downloader.params.get('prefer_free_formats'):
 570                     ORDER = ['flv', 'mp4', 'webm']
 571                 else:
 572                     ORDER = ['webm', 'flv', 'mp4']
 573                 try:
 574                     ext_preference = ORDER.index(f['ext'])
 575                 except ValueError:
 576                     ext_preference = -1
 577                 audio_ext_preference = 0
 578
 579             return (
 580                 preference,
 581                 f.get('quality') if f.get('quality') is not None else -1,
 582                 f.get('height') if f.get('height') is not None else -1,
 583                 f.get('width') if f.get('width') is not None else -1,
 584                 ext_preference,
 585                 f.get('tbr') if f.get('tbr') is not None else -1,
 586                 f.get('vbr') if f.get('vbr') is not None else -1,
 587                 f.get('abr') if f.get('abr') is not None else -1,
 588                 audio_ext_preference,
 589                 f.get('filesize') if f.get('filesize') is not None else -1,
 590                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 591                 f.get('format_id'),
 592             )
 593         formats.sort(key=_formats_key)
 594
 595     def http_scheme(self):
 596         """ Either "https:" or "https:", depending on the user's preferences """
 597         return (
 598             'http:'
 599             if self._downloader.params.get('prefer_insecure', False)
 600             else 'https:')
 601
 602     def _proto_relative_url(self, url, scheme=None):
 603         if url is None:
 604             return url
 605         if url.startswith('//'):
 606             if scheme is None:
 607                 scheme = self.http_scheme()
 608             return scheme + url
 609         else:
 610             return url
 611
 612     def _sleep(self, timeout, video_id, msg_template=None):
 613         if msg_template is None:
 614             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 615         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 616         self.to_screen(msg)
 617         time.sleep(timeout)
 618
 619     def _extract_f4m_formats(self, manifest_url, video_id):
 620         manifest = self._download_xml(
 621             manifest_url, video_id, 'Downloading f4m manifest',
 622             'Unable to download f4m manifest')
 623
 624         formats = []
 625         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 626         for i, media_el in enumerate(media_nodes):
 627             tbr = int_or_none(media_el.attrib.get('bitrate'))
 628             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 629             formats.append({
 630                 'format_id': format_id,
 631                 'url': manifest_url,
 632                 'ext': 'flv',
 633                 'tbr': tbr,
 634                 'width': int_or_none(media_el.attrib.get('width')),
 635                 'height': int_or_none(media_el.attrib.get('height')),
 636             })
 637         self._sort_formats(formats)
 638
 639         return formats
 640
 641     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
 642         formats = [{
 643             'format_id': 'm3u8-meta',
 644             'url': m3u8_url,
 645             'ext': ext,
 646             'protocol': 'm3u8',
 647             'preference': -1,
 648             'resolution': 'multiple',
 649             'format_note': 'Quality selection URL',
 650         }]
 651
 652         m3u8_doc = self._download_webpage(m3u8_url, video_id)
 653         last_info = None
 654         kv_rex = re.compile(
 655             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 656         for line in m3u8_doc.splitlines():
 657             if line.startswith('#EXT-X-STREAM-INF:'):
 658                 last_info = {}
 659                 for m in kv_rex.finditer(line):
 660                     v = m.group('val')
 661                     if v.startswith('"'):
 662                         v = v[1:-1]
 663                     last_info[m.group('key')] = v
 664             elif line.startswith('#') or not line.strip():
 665                 continue
 666             else:
 667                 if last_info is None:
 668                     formats.append({'url': line})
 669                     continue
 670                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 671
 672                 f = {
 673                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 674                     'url': line.strip(),
 675                     'tbr': tbr,
 676                     'ext': ext,
 677                 }
 678                 codecs = last_info.get('CODECS')
 679                 if codecs:
 680                     # TODO: looks like video codec is not always necessarily goes first
 681                     va_codecs = codecs.split(',')
 682                     if va_codecs[0]:
 683                         f['vcodec'] = va_codecs[0].partition('.')[0]
 684                     if len(va_codecs) > 1 and va_codecs[1]:
 685                         f['acodec'] = va_codecs[1].partition('.')[0]
 686                 resolution = last_info.get('RESOLUTION')
 687                 if resolution:
 688                     width_str, height_str = resolution.split('x')
 689                     f['width'] = int(width_str)
 690                     f['height'] = int(height_str)
 691                 formats.append(f)
 692                 last_info = {}
 693         self._sort_formats(formats)
 694         return formats
 695
 696
 697 class SearchInfoExtractor(InfoExtractor):
 698     """
 699     Base class for paged search queries extractors.
 700     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 701     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 702     """
 703
 704     @classmethod
 705     def _make_valid_url(cls):
 706         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 707
 708     @classmethod
 709     def suitable(cls, url):
 710         return re.match(cls._make_valid_url(), url) is not None
 711
 712     def _real_extract(self, query):
 713         mobj = re.match(self._make_valid_url(), query)
 714         if mobj is None:
 715             raise ExtractorError('Invalid search query "%s"' % query)
 716
 717         prefix = mobj.group('prefix')
 718         query = mobj.group('query')
 719         if prefix == '':
 720             return self._get_n_results(query, 1)
 721         elif prefix == 'all':
 722             return self._get_n_results(query, self._MAX_RESULTS)
 723         else:
 724             n = int(prefix)
 725             if n <= 0:
 726                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 727             elif n > self._MAX_RESULTS:
 728                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 729                 n = self._MAX_RESULTS
 730             return self._get_n_results(query, n)
 731
 732     def _get_n_results(self, query, n):
 733         """Get a specified number of results for a query"""
 734         raise NotImplementedError("This method must be implemented by subclasses")
 735
 736     @property
 737     def SEARCH_KEY(self):
 738         return self._SEARCH_KEY