_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Sentinel meaning "the caller supplied no default". It is compared by
# identity so that None remains usable as a real default value.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
  60                                  Calculated from the format_id, width, height.
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * vcodec     Name of the video codec in use
  76                     * container  Name of the container format
  77                     * filesize   The number of bytes, if known in advance
  78                     * filesize_approx  An estimate for the number of bytes
  79                     * player_url SWF Player URL (used for rtmpdump).
  80                     * protocol   The protocol that will be used for the actual
  81                                  download, lower-case.
  82                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  83                     * preference Order number of this format. If this field is
  84                                  present and not None, the formats get sorted
  85                                  by this field, regardless of all other values.
  86                                  -1 for default (order by other properties),
  87                                  -2 or smaller for less than default.
  88                     * quality    Order number of the video quality of this
  89                                  format, irrespective of the file format.
  90                                  -1 for default (order by other properties),
  91                                  -2 or smaller for less than default.
  92                     * http_referer  HTTP Referer header value to set.
  93                     * http_method  HTTP method to use for the download.
  94                     * http_headers  A dictionary of additional HTTP headers
  95                                  to add to the request.
  96                     * http_post_data  Additional data to send with a POST
  97                                  request.
  98     url:            Final video URL.
  99     ext:            Video filename extension.
 100     format:         The video format, defaults to ext (used for --get-format)
 101     player_url:     SWF Player URL (used for rtmpdump).
 102
 103     The following fields are optional:
 104
 105     display_id      An alternative identifier for the video, not necessarily
 106                     unique, but available before title. Typically, id is
 107                     something like "4234987", title "Dancing naked mole rats",
 108                     and display_id "dancing-naked-mole-rats"
 109     thumbnails:     A list of dictionaries, with the following entries:
 110                         * "url"
 111                         * "width" (optional, int)
 112                         * "height" (optional, int)
 113                         * "resolution" (optional, string "{width}x{height}",
 114                                         deprecated)
 115     thumbnail:      Full URL to a video thumbnail image.
 116     description:    One-line video description.
 117     uploader:       Full name of the video uploader.
 118     timestamp:      UNIX timestamp of the moment the video became available.
 119     upload_date:    Video upload date (YYYYMMDD).
 120                     If not explicitly set, calculated from timestamp.
 121     uploader_id:    Nickname or id of the video uploader.
 122     location:       Physical location where the video was filmed.
 123     subtitles:      The subtitle file contents as a dictionary in the format
 124                     {language: subtitles}.
 125     duration:       Length of the video in seconds, as an integer.
 126     view_count:     How many users have watched the video on the platform.
 127     like_count:     Number of positive ratings of the video
 128     dislike_count:  Number of negative ratings of the video
 129     comment_count:  Number of comments on the video
 130     age_limit:      Age restriction for the video, as an integer (years)
 131     webpage_url:    The url to the video webpage, if given to youtube-dl it
 132                     should allow to get the same result again. (It will be set
 133                     by YoutubeDL if it's missing)
 134     categories:     A list of categories that the video falls in, for example
 135                     ["Sports", "Berlin"]
 136     is_live:        True, False, or None (=unknown). Whether this video is a
 137                     live stream that goes on instead of a fixed-length video.
 138
 139     Unless mentioned otherwise, the fields should be Unicode strings.
 140
 141     Subclasses of this one should re-define the _real_initialize() and
 142     _real_extract() methods and define a _VALID_URL regexp.
 143     Probably, they should also be added to the list of extractors.
 144
 145     Finally, the _WORKING attribute should be set to False for broken IEs
 146     in order to warn the users and skip the tests.
 147     """
 148
 149     _ready = False
 150     _downloader = None
 151     _WORKING = True
 152
 153     def __init__(self, downloader=None):
 154         """Constructor. Receives an optional downloader."""
 155         self._ready = False
 156         self.set_downloader(downloader)
 157
 158     @classmethod
 159     def suitable(cls, url):
 160         """Receives a URL and returns True if suitable for this IE."""
 161
 162         # This does not use has/getattr intentionally - we want to know whether
 163         # we have cached the regexp for *this* class, whereas getattr would also
 164         # match the superclass
 165         if '_VALID_URL_RE' not in cls.__dict__:
 166             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 167         return cls._VALID_URL_RE.match(url) is not None
 168
 169     @classmethod
 170     def _match_id(cls, url):
 171         if '_VALID_URL_RE' not in cls.__dict__:
 172             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 173         m = cls._VALID_URL_RE.match(url)
 174         assert m
 175         return m.group('id')
 176
 177     @classmethod
 178     def working(cls):
 179         """Getter method for _WORKING."""
 180         return cls._WORKING
 181
 182     def initialize(self):
 183         """Initializes an instance (authentication, etc)."""
 184         if not self._ready:
 185             self._real_initialize()
 186             self._ready = True
 187
 188     def extract(self, url):
 189         """Extracts URL information and returns it in list of dicts."""
 190         self.initialize()
 191         return self._real_extract(url)
 192
 193     def set_downloader(self, downloader):
 194         """Sets the downloader for this IE."""
 195         self._downloader = downloader
 196
 197     def _real_initialize(self):
 198         """Real initialization process. Redefine in subclasses."""
 199         pass
 200
 201     def _real_extract(self, url):
 202         """Real extraction process. Redefine in subclasses."""
 203         pass
 204
 205     @classmethod
 206     def ie_key(cls):
 207         """A string for getting the InfoExtractor with get_info_extractor"""
 208         return cls.__name__[:-2]
 209
 210     @property
 211     def IE_NAME(self):
 212         return type(self).__name__[:-2]
 213
 214     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 215         """ Returns the response handle """
 216         if note is None:
 217             self.report_download_webpage(video_id)
 218         elif note is not False:
 219             if video_id is None:
 220                 self.to_screen('%s' % (note,))
 221             else:
 222                 self.to_screen('%s: %s' % (video_id, note))
 223         try:
 224             return self._downloader.urlopen(url_or_request)
 225         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 226             if errnote is False:
 227                 return False
 228             if errnote is None:
 229                 errnote = 'Unable to download webpage'
 230             errmsg = '%s: %s' % (errnote, compat_str(err))
 231             if fatal:
 232                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 233             else:
 234                 self._downloader.report_warning(errmsg)
 235                 return False
 236
 237     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 238         """ Returns a tuple (page content as string, URL handle) """
 239
 240         # Strip hashes from the URL (#1038)
 241         if isinstance(url_or_request, (compat_str, str)):
 242             url_or_request = url_or_request.partition('#')[0]
 243
 244         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 245         if urlh is False:
 246             assert not fatal
 247             return False
 248         content_type = urlh.headers.get('Content-Type', '')
 249         webpage_bytes = urlh.read()
 250         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 251         if m:
 252             encoding = m.group(1)
 253         else:
 254             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 255                           webpage_bytes[:1024])
 256             if m:
 257                 encoding = m.group(1).decode('ascii')
 258             elif webpage_bytes.startswith(b'\xff\xfe'):
 259                 encoding = 'utf-16'
 260             else:
 261                 encoding = 'utf-8'
 262         if self._downloader.params.get('dump_intermediate_pages', False):
 263             try:
 264                 url = url_or_request.get_full_url()
 265             except AttributeError:
 266                 url = url_or_request
 267             self.to_screen('Dumping request to ' + url)
 268             dump = base64.b64encode(webpage_bytes).decode('ascii')
 269             self._downloader.to_screen(dump)
 270         if self._downloader.params.get('write_pages', False):
 271             try:
 272                 url = url_or_request.get_full_url()
 273             except AttributeError:
 274                 url = url_or_request
 275             basen = '%s_%s' % (video_id, url)
 276             if len(basen) > 240:
 277                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 278                 basen = basen[:240 - len(h)] + h
 279             raw_filename = basen + '.dump'
 280             filename = sanitize_filename(raw_filename, restricted=True)
 281             self.to_screen('Saving request to ' + filename)
 282             with open(filename, 'wb') as outf:
 283                 outf.write(webpage_bytes)
 284
 285         try:
 286             content = webpage_bytes.decode(encoding, 'replace')
 287         except LookupError:
 288             content = webpage_bytes.decode('utf-8', 'replace')
 289
 290         if ('<title>Access to this site is blocked</title>' in content and
 291                 'Websense' in content[:512]):
 292             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 293             blocked_iframe = self._html_search_regex(
 294                 r'<iframe src="([^"]+)"', content,
 295                 'Websense information URL', default=None)
 296             if blocked_iframe:
 297                 msg += ' Visit %s for more details' % blocked_iframe
 298             raise ExtractorError(msg, expected=True)
 299
 300         return (content, urlh)
 301
 302     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 303         """ Returns the data of the page as a string """
 304         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 305         if res is False:
 306             return res
 307         else:
 308             content, _ = res
 309             return content
 310
 311     def _download_xml(self, url_or_request, video_id,
 312                       note='Downloading XML', errnote='Unable to download XML',
 313                       transform_source=None, fatal=True):
 314         """Return the xml as an xml.etree.ElementTree.Element"""
 315         xml_string = self._download_webpage(
 316             url_or_request, video_id, note, errnote, fatal=fatal)
 317         if xml_string is False:
 318             return xml_string
 319         if transform_source:
 320             xml_string = transform_source(xml_string)
 321         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 322
 323     def _download_json(self, url_or_request, video_id,
 324                        note='Downloading JSON metadata',
 325                        errnote='Unable to download JSON metadata',
 326                        transform_source=None,
 327                        fatal=True):
 328         json_string = self._download_webpage(
 329             url_or_request, video_id, note, errnote, fatal=fatal)
 330         if (not fatal) and json_string is False:
 331             return None
 332         if transform_source:
 333             json_string = transform_source(json_string)
 334         try:
 335             return json.loads(json_string)
 336         except ValueError as ve:
 337             raise ExtractorError('Failed to download JSON', cause=ve)
 338
 339     def report_warning(self, msg, video_id=None):
 340         idstr = '' if video_id is None else '%s: ' % video_id
 341         self._downloader.report_warning(
 342             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 343
 344     def to_screen(self, msg):
 345         """Print msg to screen, prefixing it with '[ie_name]'"""
 346         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 347
 348     def report_extraction(self, id_or_name):
 349         """Report information extraction."""
 350         self.to_screen('%s: Extracting information' % id_or_name)
 351
 352     def report_download_webpage(self, video_id):
 353         """Report webpage download."""
 354         self.to_screen('%s: Downloading webpage' % video_id)
 355
 356     def report_age_confirmation(self):
 357         """Report attempt to confirm age."""
 358         self.to_screen('Confirming age')
 359
 360     def report_login(self):
 361         """Report attempt to log in."""
 362         self.to_screen('Logging in')
 363
 364     #Methods for following #608
 365     @staticmethod
 366     def url_result(url, ie=None, video_id=None):
 367         """Returns a url that points to a page that should be processed"""
 368         #TODO: ie should be the class used for getting the info
 369         video_info = {'_type': 'url',
 370                       'url': url,
 371                       'ie_key': ie}
 372         if video_id is not None:
 373             video_info['id'] = video_id
 374         return video_info
 375     @staticmethod
 376     def playlist_result(entries, playlist_id=None, playlist_title=None):
 377         """Returns a playlist"""
 378         video_info = {'_type': 'playlist',
 379                       'entries': entries}
 380         if playlist_id:
 381             video_info['id'] = playlist_id
 382         if playlist_title:
 383             video_info['title'] = playlist_title
 384         return video_info
 385
 386     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 387         """
 388         Perform a regex search on the given string, using a single or a list of
 389         patterns returning the first matching group.
 390         In case of failure return a default value or raise a WARNING or a
 391         RegexNotFoundError, depending on fatal, specifying the field name.
 392         """
 393         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 394             mobj = re.search(pattern, string, flags)
 395         else:
 396             for p in pattern:
 397                 mobj = re.search(p, string, flags)
 398                 if mobj:
 399                     break
 400
 401         if os.name != 'nt' and sys.stderr.isatty():
 402             _name = '\033[0;34m%s\033[0m' % name
 403         else:
 404             _name = name
 405
 406         if mobj:
 407             # return the first matching group
 408             return next(g for g in mobj.groups() if g is not None)
 409         elif default is not _NO_DEFAULT:
 410             return default
 411         elif fatal:
 412             raise RegexNotFoundError('Unable to extract %s' % _name)
 413         else:
 414             self._downloader.report_warning('unable to extract %s; '
 415                 'please report this issue on http://yt-dl.org/bug' % _name)
 416             return None
 417
 418     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 419         """
 420         Like _search_regex, but strips HTML tags and unescapes entities.
 421         """
 422         res = self._search_regex(pattern, string, name, default, fatal, flags)
 423         if res:
 424             return clean_html(res).strip()
 425         else:
 426             return res
 427
 428     def _get_login_info(self):
 429         """
 430         Get the the login info as (username, password)
 431         It will look in the netrc file using the _NETRC_MACHINE value
 432         If there's no info available, return (None, None)
 433         """
 434         if self._downloader is None:
 435             return (None, None)
 436
 437         username = None
 438         password = None
 439         downloader_params = self._downloader.params
 440
 441         # Attempt to use provided username and password or .netrc data
 442         if downloader_params.get('username', None) is not None:
 443             username = downloader_params['username']
 444             password = downloader_params['password']
 445         elif downloader_params.get('usenetrc', False):
 446             try:
 447                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 448                 if info is not None:
 449                     username = info[0]
 450                     password = info[2]
 451                 else:
 452                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 453             except (IOError, netrc.NetrcParseError) as err:
 454                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 455
 456         return (username, password)
 457
 458     def _get_tfa_info(self):
 459         """
 460         Get the two-factor authentication info
 461         TODO - asking the user will be required for sms/phone verify
 462         currently just uses the command line option
 463         If there's no info available, return None
 464         """
 465         if self._downloader is None:
 466             return None
 467         downloader_params = self._downloader.params
 468
 469         if downloader_params.get('twofactor', None) is not None:
 470             return downloader_params['twofactor']
 471
 472         return None
 473
 474     # Helper functions for extracting OpenGraph info
 475     @staticmethod
 476     def _og_regexes(prop):
 477         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 478         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 479         template = r'<meta[^>]+?%s[^>]+?%s'
 480         return [
 481             template % (property_re, content_re),
 482             template % (content_re, property_re),
 483         ]
 484
 485     def _og_search_property(self, prop, html, name=None, **kargs):
 486         if name is None:
 487             name = 'OpenGraph %s' % prop
 488         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 489         if escaped is None:
 490             return None
 491         return unescapeHTML(escaped)
 492
 493     def _og_search_thumbnail(self, html, **kargs):
 494         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 495
 496     def _og_search_description(self, html, **kargs):
 497         return self._og_search_property('description', html, fatal=False, **kargs)
 498
 499     def _og_search_title(self, html, **kargs):
 500         return self._og_search_property('title', html, **kargs)
 501
 502     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 503         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 504         if secure:
 505             regexes = self._og_regexes('video:secure_url') + regexes
 506         return self._html_search_regex(regexes, html, name, **kargs)
 507
 508     def _og_search_url(self, html, **kargs):
 509         return self._og_search_property('url', html, **kargs)
 510
 511     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 512         if display_name is None:
 513             display_name = name
 514         return self._html_search_regex(
 515             r'''(?ix)<meta
 516                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 517                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 518             html, display_name, fatal=fatal, **kwargs)
 519
 520     def _dc_search_uploader(self, html):
 521         return self._html_search_meta('dc.creator', html, 'uploader')
 522
 523     def _rta_search(self, html):
 524         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 525         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 526                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 527                      html):
 528             return 18
 529         return 0
 530
 531     def _media_rating_search(self, html):
 532         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 533         rating = self._html_search_meta('rating', html)
 534
 535         if not rating:
 536             return None
 537
 538         RATING_TABLE = {
 539             'safe for kids': 0,
 540             'general': 8,
 541             '14 years': 14,
 542             'mature': 17,
 543             'restricted': 19,
 544         }
 545         return RATING_TABLE.get(rating.lower(), None)
 546
 547     def _twitter_search_player(self, html):
 548         return self._html_search_meta('twitter:player', html,
 549             'twitter card player')
 550
 551     def _sort_formats(self, formats):
 552         if not formats:
 553             raise ExtractorError('No video formats found')
 554
 555         def _formats_key(f):
 556             # TODO remove the following workaround
 557             from ..utils import determine_ext
 558             if not f.get('ext') and 'url' in f:
 559                 f['ext'] = determine_ext(f['url'])
 560
 561             preference = f.get('preference')
 562             if preference is None:
 563                 proto = f.get('protocol')
 564                 if proto is None:
 565                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 566
 567                 preference = 0 if proto in ['http', 'https'] else -0.1
 568                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 569                     preference -= 0.5
 570
 571             if f.get('vcodec') == 'none':  # audio only
 572                 if self._downloader.params.get('prefer_free_formats'):
 573                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 574                 else:
 575                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 576                 ext_preference = 0
 577                 try:
 578                     audio_ext_preference = ORDER.index(f['ext'])
 579                 except ValueError:
 580                     audio_ext_preference = -1
 581             else:
 582                 if self._downloader.params.get('prefer_free_formats'):
 583                     ORDER = ['flv', 'mp4', 'webm']
 584                 else:
 585                     ORDER = ['webm', 'flv', 'mp4']
 586                 try:
 587                     ext_preference = ORDER.index(f['ext'])
 588                 except ValueError:
 589                     ext_preference = -1
 590                 audio_ext_preference = 0
 591
 592             return (
 593                 preference,
 594                 f.get('quality') if f.get('quality') is not None else -1,
 595                 f.get('height') if f.get('height') is not None else -1,
 596                 f.get('width') if f.get('width') is not None else -1,
 597                 ext_preference,
 598                 f.get('tbr') if f.get('tbr') is not None else -1,
 599                 f.get('vbr') if f.get('vbr') is not None else -1,
 600                 f.get('abr') if f.get('abr') is not None else -1,
 601                 audio_ext_preference,
 602                 f.get('filesize') if f.get('filesize') is not None else -1,
 603                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 604                 f.get('format_id'),
 605             )
 606         formats.sort(key=_formats_key)
 607
 608     def http_scheme(self):
 609         """ Either "https:" or "https:", depending on the user's preferences """
 610         return (
 611             'http:'
 612             if self._downloader.params.get('prefer_insecure', False)
 613             else 'https:')
 614
 615     def _proto_relative_url(self, url, scheme=None):
 616         if url is None:
 617             return url
 618         if url.startswith('//'):
 619             if scheme is None:
 620                 scheme = self.http_scheme()
 621             return scheme + url
 622         else:
 623             return url
 624
 625     def _sleep(self, timeout, video_id, msg_template=None):
 626         if msg_template is None:
 627             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 628         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 629         self.to_screen(msg)
 630         time.sleep(timeout)
 631
 632     def _extract_f4m_formats(self, manifest_url, video_id):
 633         manifest = self._download_xml(
 634             manifest_url, video_id, 'Downloading f4m manifest',
 635             'Unable to download f4m manifest')
 636
 637         formats = []
 638         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 639         for i, media_el in enumerate(media_nodes):
 640             tbr = int_or_none(media_el.attrib.get('bitrate'))
 641             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 642             formats.append({
 643                 'format_id': format_id,
 644                 'url': manifest_url,
 645                 'ext': 'flv',
 646                 'tbr': tbr,
 647                 'width': int_or_none(media_el.attrib.get('width')),
 648                 'height': int_or_none(media_el.attrib.get('height')),
 649             })
 650         self._sort_formats(formats)
 651
 652         return formats
 653
 654     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 655                               entry_protocol='m3u8', preference=None):
 656
 657         formats = [{
 658             'format_id': 'm3u8-meta',
 659             'url': m3u8_url,
 660             'ext': ext,
 661             'protocol': 'm3u8',
 662             'preference': -1,
 663             'resolution': 'multiple',
 664             'format_note': 'Quality selection URL',
 665         }]
 666
 667         format_url = lambda u: (
 668             u
 669             if re.match(r'^https?://', u)
 670             else compat_urlparse.urljoin(m3u8_url, u))
 671
 672         m3u8_doc = self._download_webpage(m3u8_url, video_id)
 673         last_info = None
 674         kv_rex = re.compile(
 675             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 676         for line in m3u8_doc.splitlines():
 677             if line.startswith('#EXT-X-STREAM-INF:'):
 678                 last_info = {}
 679                 for m in kv_rex.finditer(line):
 680                     v = m.group('val')
 681                     if v.startswith('"'):
 682                         v = v[1:-1]
 683                     last_info[m.group('key')] = v
 684             elif line.startswith('#') or not line.strip():
 685                 continue
 686             else:
 687                 if last_info is None:
 688                     formats.append({'url': format_url(line)})
 689                     continue
 690                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 691
 692                 f = {
 693                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 694                     'url': format_url(line.strip()),
 695                     'tbr': tbr,
 696                     'ext': ext,
 697                     'protocol': entry_protocol,
 698                     'preference': preference,
 699                 }
 700                 codecs = last_info.get('CODECS')
 701                 if codecs:
 702                     # TODO: looks like video codec is not always necessarily goes first
 703                     va_codecs = codecs.split(',')
 704                     if va_codecs[0]:
 705                         f['vcodec'] = va_codecs[0].partition('.')[0]
 706                     if len(va_codecs) > 1 and va_codecs[1]:
 707                         f['acodec'] = va_codecs[1].partition('.')[0]
 708                 resolution = last_info.get('RESOLUTION')
 709                 if resolution:
 710                     width_str, height_str = resolution.split('x')
 711                     f['width'] = int(width_str)
 712                     f['height'] = int(height_str)
 713                 formats.append(f)
 714                 last_info = {}
 715         self._sort_formats(formats)
 716         return formats
 717
 718     def _live_title(self, name):
 719         """ Generate the title for a live video """
 720         now = datetime.datetime.now()
 721         now_str = now.strftime("%Y-%m-%d %H:%M")
 722         return name + ' ' + now_str
 723
 724     def _int(self, v, name, fatal=False, **kwargs):
 725         res = int_or_none(v, **kwargs)
 726         if 'get_attr' in kwargs:
 727             print(getattr(v, kwargs['get_attr']))
 728         if res is None:
 729             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 730             if fatal:
 731                 raise ExtractorError(msg)
 732             else:
 733                 self._downloader.report_warning(msg)
 734         return res
 735
 736     def _float(self, v, name, fatal=False, **kwargs):
 737         res = float_or_none(v, **kwargs)
 738         if res is None:
 739             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 740             if fatal:
 741                 raise ExtractorError(msg)
 742             else:
 743                 self._downloader.report_warning(msg)
 744         return res
 745
 746
 747 class SearchInfoExtractor(InfoExtractor):
 748     """
 749     Base class for paged search queries extractors.
 750     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 751     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 752     """
 753
 754     @classmethod
 755     def _make_valid_url(cls):
 756         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 757
 758     @classmethod
 759     def suitable(cls, url):
 760         return re.match(cls._make_valid_url(), url) is not None
 761
 762     def _real_extract(self, query):
 763         mobj = re.match(self._make_valid_url(), query)
 764         if mobj is None:
 765             raise ExtractorError('Invalid search query "%s"' % query)
 766
 767         prefix = mobj.group('prefix')
 768         query = mobj.group('query')
 769         if prefix == '':
 770             return self._get_n_results(query, 1)
 771         elif prefix == 'all':
 772             return self._get_n_results(query, self._MAX_RESULTS)
 773         else:
 774             n = int(prefix)
 775             if n <= 0:
 776                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 777             elif n > self._MAX_RESULTS:
 778                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 779                 n = self._MAX_RESULTS
 780             return self._get_n_results(query, n)
 781
 782     def _get_n_results(self, query, n):
 783         """Get a specified number of results for a query"""
 784         raise NotImplementedError("This method must be implemented by subclasses")
 785
 786     @property
 787     def SEARCH_KEY(self):
 788         return self._SEARCH_KEY