_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Sentinel meaning "the caller supplied no default"; it lets _search_regex
# tell that case apart from an explicit default of None.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
  60                                  Calculated from the format_id, width, height,
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * vcodec     Name of the video codec in use
  76                     * container  Name of the container format
  77                     * filesize   The number of bytes, if known in advance
  78                     * filesize_approx  An estimate for the number of bytes
  79                     * player_url SWF Player URL (used for rtmpdump).
  80                     * protocol   The protocol that will be used for the actual
  81                                  download, lower-case.
  82                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  83                     * preference Order number of this format. If this field is
  84                                  present and not None, the formats get sorted
  85                                  by this field, regardless of all other values.
  86                                  -1 for default (order by other properties),
  87                                  -2 or smaller for less than default.
  88                     * quality    Order number of the video quality of this
  89                                  format, irrespective of the file format.
  90                                  -1 for default (order by other properties),
  91                                  -2 or smaller for less than default.
  92                     * http_referer  HTTP Referer header value to set.
  93                     * http_method  HTTP method to use for the download.
  94                     * http_headers  A dictionary of additional HTTP headers
  95                                  to add to the request.
  96                     * http_post_data  Additional data to send with a POST
  97                                  request.
  98     url:            Final video URL.
  99     ext:            Video filename extension.
 100     format:         The video format, defaults to ext (used for --get-format)
 101     player_url:     SWF Player URL (used for rtmpdump).
 102
 103     The following fields are optional:
 104
 105     display_id      An alternative identifier for the video, not necessarily
 106                     unique, but available before title. Typically, id is
 107                     something like "4234987", title "Dancing naked mole rats",
 108                     and display_id "dancing-naked-mole-rats"
 109     thumbnails:     A list of dictionaries, with the following entries:
 110                         * "url"
 111                         * "width" (optional, int)
 112                         * "height" (optional, int)
 113                         * "resolution" (optional, string "{width}x{height}",
 114                                         deprecated)
 115     thumbnail:      Full URL to a video thumbnail image.
 116     description:    One-line video description.
 117     uploader:       Full name of the video uploader.
 118     timestamp:      UNIX timestamp of the moment the video became available.
 119     upload_date:    Video upload date (YYYYMMDD).
 120                     If not explicitly set, calculated from timestamp.
 121     uploader_id:    Nickname or id of the video uploader.
 122     location:       Physical location where the video was filmed.
 123     subtitles:      The subtitle file contents as a dictionary in the format
 124                     {language: subtitles}.
 125     duration:       Length of the video in seconds, as an integer.
 126     view_count:     How many users have watched the video on the platform.
 127     like_count:     Number of positive ratings of the video
 128     dislike_count:  Number of negative ratings of the video
 129     comment_count:  Number of comments on the video
 130     age_limit:      Age restriction for the video, as an integer (years)
 131     webpage_url:    The url to the video webpage, if given to youtube-dl it
 132                     should allow to get the same result again. (It will be set
 133                     by YoutubeDL if it's missing)
 134     categories:     A list of categories that the video falls in, for example
 135                     ["Sports", "Berlin"]
 136     is_live:        True, False, or None (=unknown). Whether this video is a
 137                     live stream that goes on instead of a fixed-length video.
 138
 139     Unless mentioned otherwise, the fields should be Unicode strings.
 140
 141     Unless mentioned otherwise, None is equivalent to absence of information.
 142
 143     Subclasses of this one should re-define the _real_initialize() and
 144     _real_extract() methods and define a _VALID_URL regexp.
 145     Probably, they should also be added to the list of extractors.
 146
 147     Finally, the _WORKING attribute should be set to False for broken IEs
 148     in order to warn the users and skip the tests.
 149     """
 150
 151     _ready = False
 152     _downloader = None
 153     _WORKING = True
 154
 155     def __init__(self, downloader=None):
 156         """Constructor. Receives an optional downloader."""
 157         self._ready = False
 158         self.set_downloader(downloader)
 159
 160     @classmethod
 161     def suitable(cls, url):
 162         """Receives a URL and returns True if suitable for this IE."""
 163
 164         # This does not use has/getattr intentionally - we want to know whether
 165         # we have cached the regexp for *this* class, whereas getattr would also
 166         # match the superclass
 167         if '_VALID_URL_RE' not in cls.__dict__:
 168             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 169         return cls._VALID_URL_RE.match(url) is not None
 170
 171     @classmethod
 172     def _match_id(cls, url):
 173         if '_VALID_URL_RE' not in cls.__dict__:
 174             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 175         m = cls._VALID_URL_RE.match(url)
 176         assert m
 177         return m.group('id')
 178
 179     @classmethod
 180     def working(cls):
 181         """Getter method for _WORKING."""
 182         return cls._WORKING
 183
 184     def initialize(self):
 185         """Initializes an instance (authentication, etc)."""
 186         if not self._ready:
 187             self._real_initialize()
 188             self._ready = True
 189
 190     def extract(self, url):
 191         """Extracts URL information and returns it in list of dicts."""
 192         self.initialize()
 193         return self._real_extract(url)
 194
 195     def set_downloader(self, downloader):
 196         """Sets the downloader for this IE."""
 197         self._downloader = downloader
 198
 199     def _real_initialize(self):
 200         """Real initialization process. Redefine in subclasses."""
 201         pass
 202
 203     def _real_extract(self, url):
 204         """Real extraction process. Redefine in subclasses."""
 205         pass
 206
 207     @classmethod
 208     def ie_key(cls):
 209         """A string for getting the InfoExtractor with get_info_extractor"""
 210         return cls.__name__[:-2]
 211
 212     @property
 213     def IE_NAME(self):
 214         return type(self).__name__[:-2]
 215
 216     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 217         """ Returns the response handle """
 218         if note is None:
 219             self.report_download_webpage(video_id)
 220         elif note is not False:
 221             if video_id is None:
 222                 self.to_screen('%s' % (note,))
 223             else:
 224                 self.to_screen('%s: %s' % (video_id, note))
 225         try:
 226             return self._downloader.urlopen(url_or_request)
 227         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 228             if errnote is False:
 229                 return False
 230             if errnote is None:
 231                 errnote = 'Unable to download webpage'
 232             errmsg = '%s: %s' % (errnote, compat_str(err))
 233             if fatal:
 234                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 235             else:
 236                 self._downloader.report_warning(errmsg)
 237                 return False
 238
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Download a page and return it decoded, together with its handle.

        Returns a tuple (page content as string, URL handle), or False when
        _request_webpage() reported a failure by returning False.
        note, errnote and fatal are passed through to _request_webpage().

        Depending on the downloader params, the raw bytes are also dumped to
        the screen base64-encoded ('dump_intermediate_pages') and/or saved to
        a '.dump' file ('write_pages').

        Raises ExtractorError (expected=True) if the page looks like a
        Websense block page.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # NOTE(review): _request_webpage also returns False when errnote is
            # False, whatever fatal is - confirm no caller combines
            # errnote=False with fatal=True, or this assertion fires.
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Pick the encoding: the Content-Type charset wins, then a <meta>
        # charset found in the first 1024 bytes, then the b'\xff\xfe' prefix
        # (treated as utf-16), and finally utf-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object exposing
            # get_full_url() or a plain URL string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is replaced
            # by an md5 digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name unknown to Python (LookupError) falls back to utf-8;
        # undecodable bytes are replaced rather than raising.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect the block page served by Websense filtering software and
        # report it as an expected error instead of returning the page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
 309
 310     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 311         """ Returns the data of the page as a string """
 312         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 313         if res is False:
 314             return res
 315         else:
 316             content, _ = res
 317             return content
 318
 319     def _download_xml(self, url_or_request, video_id,
 320                       note='Downloading XML', errnote='Unable to download XML',
 321                       transform_source=None, fatal=True):
 322         """Return the xml as an xml.etree.ElementTree.Element"""
 323         xml_string = self._download_webpage(
 324             url_or_request, video_id, note, errnote, fatal=fatal)
 325         if xml_string is False:
 326             return xml_string
 327         if transform_source:
 328             xml_string = transform_source(xml_string)
 329         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 330
 331     def _download_json(self, url_or_request, video_id,
 332                        note='Downloading JSON metadata',
 333                        errnote='Unable to download JSON metadata',
 334                        transform_source=None,
 335                        fatal=True):
 336         json_string = self._download_webpage(
 337             url_or_request, video_id, note, errnote, fatal=fatal)
 338         if (not fatal) and json_string is False:
 339             return None
 340         if transform_source:
 341             json_string = transform_source(json_string)
 342         try:
 343             return json.loads(json_string)
 344         except ValueError as ve:
 345             errmsg = '%s: Failed to parse JSON ' % video_id
 346             if fatal:
 347                 raise ExtractorError(errmsg, cause=ve)
 348             else:
 349                 self.report_warning(errmsg + str(ve))
 350
 351     def report_warning(self, msg, video_id=None):
 352         idstr = '' if video_id is None else '%s: ' % video_id
 353         self._downloader.report_warning(
 354             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 355
 356     def to_screen(self, msg):
 357         """Print msg to screen, prefixing it with '[ie_name]'"""
 358         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 359
 360     def report_extraction(self, id_or_name):
 361         """Report information extraction."""
 362         self.to_screen('%s: Extracting information' % id_or_name)
 363
 364     def report_download_webpage(self, video_id):
 365         """Report webpage download."""
 366         self.to_screen('%s: Downloading webpage' % video_id)
 367
 368     def report_age_confirmation(self):
 369         """Report attempt to confirm age."""
 370         self.to_screen('Confirming age')
 371
 372     def report_login(self):
 373         """Report attempt to log in."""
 374         self.to_screen('Logging in')
 375
 376     #Methods for following #608
 377     @staticmethod
 378     def url_result(url, ie=None, video_id=None):
 379         """Returns a url that points to a page that should be processed"""
 380         #TODO: ie should be the class used for getting the info
 381         video_info = {'_type': 'url',
 382                       'url': url,
 383                       'ie_key': ie}
 384         if video_id is not None:
 385             video_info['id'] = video_id
 386         return video_info
 387     @staticmethod
 388     def playlist_result(entries, playlist_id=None, playlist_title=None):
 389         """Returns a playlist"""
 390         video_info = {'_type': 'playlist',
 391                       'entries': entries}
 392         if playlist_id:
 393             video_info['id'] = playlist_id
 394         if playlist_title:
 395             video_info['title'] = playlist_title
 396         return video_info
 397
 398     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 399         """
 400         Perform a regex search on the given string, using a single or a list of
 401         patterns returning the first matching group.
 402         In case of failure return a default value or raise a WARNING or a
 403         RegexNotFoundError, depending on fatal, specifying the field name.
 404         """
 405         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 406             mobj = re.search(pattern, string, flags)
 407         else:
 408             for p in pattern:
 409                 mobj = re.search(p, string, flags)
 410                 if mobj:
 411                     break
 412
 413         if os.name != 'nt' and sys.stderr.isatty():
 414             _name = '\033[0;34m%s\033[0m' % name
 415         else:
 416             _name = name
 417
 418         if mobj:
 419             # return the first matching group
 420             return next(g for g in mobj.groups() if g is not None)
 421         elif default is not _NO_DEFAULT:
 422             return default
 423         elif fatal:
 424             raise RegexNotFoundError('Unable to extract %s' % _name)
 425         else:
 426             self._downloader.report_warning('unable to extract %s; '
 427                 'please report this issue on http://yt-dl.org/bug' % _name)
 428             return None
 429
 430     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 431         """
 432         Like _search_regex, but strips HTML tags and unescapes entities.
 433         """
 434         res = self._search_regex(pattern, string, name, default, fatal, flags)
 435         if res:
 436             return clean_html(res).strip()
 437         else:
 438             return res
 439
 440     def _get_login_info(self):
 441         """
 442         Get the the login info as (username, password)
 443         It will look in the netrc file using the _NETRC_MACHINE value
 444         If there's no info available, return (None, None)
 445         """
 446         if self._downloader is None:
 447             return (None, None)
 448
 449         username = None
 450         password = None
 451         downloader_params = self._downloader.params
 452
 453         # Attempt to use provided username and password or .netrc data
 454         if downloader_params.get('username', None) is not None:
 455             username = downloader_params['username']
 456             password = downloader_params['password']
 457         elif downloader_params.get('usenetrc', False):
 458             try:
 459                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 460                 if info is not None:
 461                     username = info[0]
 462                     password = info[2]
 463                 else:
 464                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 465             except (IOError, netrc.NetrcParseError) as err:
 466                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 467
 468         return (username, password)
 469
 470     def _get_tfa_info(self):
 471         """
 472         Get the two-factor authentication info
 473         TODO - asking the user will be required for sms/phone verify
 474         currently just uses the command line option
 475         If there's no info available, return None
 476         """
 477         if self._downloader is None:
 478             return None
 479         downloader_params = self._downloader.params
 480
 481         if downloader_params.get('twofactor', None) is not None:
 482             return downloader_params['twofactor']
 483
 484         return None
 485
 486     # Helper functions for extracting OpenGraph info
 487     @staticmethod
 488     def _og_regexes(prop):
 489         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 490         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 491         template = r'<meta[^>]+?%s[^>]+?%s'
 492         return [
 493             template % (property_re, content_re),
 494             template % (content_re, property_re),
 495         ]
 496
 497     def _og_search_property(self, prop, html, name=None, **kargs):
 498         if name is None:
 499             name = 'OpenGraph %s' % prop
 500         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 501         if escaped is None:
 502             return None
 503         return unescapeHTML(escaped)
 504
 505     def _og_search_thumbnail(self, html, **kargs):
 506         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 507
 508     def _og_search_description(self, html, **kargs):
 509         return self._og_search_property('description', html, fatal=False, **kargs)
 510
 511     def _og_search_title(self, html, **kargs):
 512         return self._og_search_property('title', html, **kargs)
 513
 514     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 515         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 516         if secure:
 517             regexes = self._og_regexes('video:secure_url') + regexes
 518         return self._html_search_regex(regexes, html, name, **kargs)
 519
 520     def _og_search_url(self, html, **kargs):
 521         return self._og_search_property('url', html, **kargs)
 522
 523     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 524         if display_name is None:
 525             display_name = name
 526         return self._html_search_regex(
 527             r'''(?ix)<meta
 528                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 529                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 530             html, display_name, fatal=fatal, **kwargs)
 531
 532     def _dc_search_uploader(self, html):
 533         return self._html_search_meta('dc.creator', html, 'uploader')
 534
 535     def _rta_search(self, html):
 536         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 537         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 538                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 539                      html):
 540             return 18
 541         return 0
 542
 543     def _media_rating_search(self, html):
 544         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 545         rating = self._html_search_meta('rating', html)
 546
 547         if not rating:
 548             return None
 549
 550         RATING_TABLE = {
 551             'safe for kids': 0,
 552             'general': 8,
 553             '14 years': 14,
 554             'mature': 17,
 555             'restricted': 19,
 556         }
 557         return RATING_TABLE.get(rating.lower(), None)
 558
 559     def _twitter_search_player(self, html):
 560         return self._html_search_meta('twitter:player', html,
 561             'twitter card player')
 562
 563     def _sort_formats(self, formats):
 564         if not formats:
 565             raise ExtractorError('No video formats found')
 566
 567         def _formats_key(f):
 568             # TODO remove the following workaround
 569             from ..utils import determine_ext
 570             if not f.get('ext') and 'url' in f:
 571                 f['ext'] = determine_ext(f['url'])
 572
 573             preference = f.get('preference')
 574             if preference is None:
 575                 proto = f.get('protocol')
 576                 if proto is None:
 577                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 578
 579                 preference = 0 if proto in ['http', 'https'] else -0.1
 580                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 581                     preference -= 0.5
 582
 583             if f.get('vcodec') == 'none':  # audio only
 584                 if self._downloader.params.get('prefer_free_formats'):
 585                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 586                 else:
 587                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 588                 ext_preference = 0
 589                 try:
 590                     audio_ext_preference = ORDER.index(f['ext'])
 591                 except ValueError:
 592                     audio_ext_preference = -1
 593             else:
 594                 if self._downloader.params.get('prefer_free_formats'):
 595                     ORDER = ['flv', 'mp4', 'webm']
 596                 else:
 597                     ORDER = ['webm', 'flv', 'mp4']
 598                 try:
 599                     ext_preference = ORDER.index(f['ext'])
 600                 except ValueError:
 601                     ext_preference = -1
 602                 audio_ext_preference = 0
 603
 604             return (
 605                 preference,
 606                 f.get('quality') if f.get('quality') is not None else -1,
 607                 f.get('height') if f.get('height') is not None else -1,
 608                 f.get('width') if f.get('width') is not None else -1,
 609                 ext_preference,
 610                 f.get('tbr') if f.get('tbr') is not None else -1,
 611                 f.get('vbr') if f.get('vbr') is not None else -1,
 612                 f.get('abr') if f.get('abr') is not None else -1,
 613                 audio_ext_preference,
 614                 f.get('filesize') if f.get('filesize') is not None else -1,
 615                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 616                 f.get('format_id'),
 617             )
 618         formats.sort(key=_formats_key)
 619
 620     def http_scheme(self):
 621         """ Either "http:" or "https:", depending on the user's preferences """
 622         return (
 623             'http:'
 624             if self._downloader.params.get('prefer_insecure', False)
 625             else 'https:')
 626
 627     def _proto_relative_url(self, url, scheme=None):
 628         if url is None:
 629             return url
 630         if url.startswith('//'):
 631             if scheme is None:
 632                 scheme = self.http_scheme()
 633             return scheme + url
 634         else:
 635             return url
 636
 637     def _sleep(self, timeout, video_id, msg_template=None):
 638         if msg_template is None:
 639             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 640         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 641         self.to_screen(msg)
 642         time.sleep(timeout)
 643
 644     def _extract_f4m_formats(self, manifest_url, video_id):
 645         manifest = self._download_xml(
 646             manifest_url, video_id, 'Downloading f4m manifest',
 647             'Unable to download f4m manifest')
 648
 649         formats = []
 650         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 651         for i, media_el in enumerate(media_nodes):
 652             tbr = int_or_none(media_el.attrib.get('bitrate'))
 653             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 654             formats.append({
 655                 'format_id': format_id,
 656                 'url': manifest_url,
 657                 'ext': 'flv',
 658                 'tbr': tbr,
 659                 'width': int_or_none(media_el.attrib.get('width')),
 660                 'height': int_or_none(media_el.attrib.get('height')),
 661             })
 662         self._sort_formats(formats)
 663
 664         return formats
 665
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        """Download the playlist at m3u8_url and return its formats, sorted.

        The list always contains an 'm3u8-meta' format pointing at m3u8_url
        itself, plus one format per non-comment line of the playlist.
        ext, entry_protocol and preference are copied into the formats built
        from lines that come after an #EXT-X-STREAM-INF line.
        """

        # Entry for the playlist itself, ranked with preference -1
        formats = [{
            'format_id': 'm3u8-meta',
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Entries that are not absolute http(s) URLs are resolved against
        # the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        # Attributes of the most recent #EXT-X-STREAM-INF line; None until
        # the first such line has been seen.
        last_info = None
        # KEY=value pairs separated by commas; values may be double-quoted
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags/comments and blank lines carry no URL
                continue
            else:
                if last_info is None:
                    # URL line seen before any #EXT-X-STREAM-INF: nothing is
                    # known about it but its URL.
                    # NOTE(review): unlike the branch below, line is not
                    # stripped here - confirm whether that is intended.
                    formats.append({'url': format_url(line)})
                    continue
                # presumably converts BANDWIDTH from bit/s to KBit/s via
                # scale=1000 - verify against int_or_none
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

                f = {
                    # Without a usable bitrate, the current number of formats
                    # serves as the id suffix.
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    # Only the part before the first '.' is kept as codec name
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                formats.append(f)
                # Attributes apply to a single URL line; reset them (to an
                # empty dict, not None) for any further URL line.
                last_info = {}
        self._sort_formats(formats)
        return formats
 729
 730     def _live_title(self, name):
 731         """ Generate the title for a live video """
 732         now = datetime.datetime.now()
 733         now_str = now.strftime("%Y-%m-%d %H:%M")
 734         return name + ' ' + now_str
 735
 736     def _int(self, v, name, fatal=False, **kwargs):
 737         res = int_or_none(v, **kwargs)
 738         if 'get_attr' in kwargs:
 739             print(getattr(v, kwargs['get_attr']))
 740         if res is None:
 741             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 742             if fatal:
 743                 raise ExtractorError(msg)
 744             else:
 745                 self._downloader.report_warning(msg)
 746         return res
 747
 748     def _float(self, v, name, fatal=False, **kwargs):
 749         res = float_or_none(v, **kwargs)
 750         if res is None:
 751             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 752             if fatal:
 753                 raise ExtractorError(msg)
 754             else:
 755                 self._downloader.report_warning(msg)
 756         return res
 757
 758
 759 class SearchInfoExtractor(InfoExtractor):
 760     """
 761     Base class for paged search queries extractors.
 762     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 763     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 764     """
 765
 766     @classmethod
 767     def _make_valid_url(cls):
 768         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 769
 770     @classmethod
 771     def suitable(cls, url):
 772         return re.match(cls._make_valid_url(), url) is not None
 773
 774     def _real_extract(self, query):
 775         mobj = re.match(self._make_valid_url(), query)
 776         if mobj is None:
 777             raise ExtractorError('Invalid search query "%s"' % query)
 778
 779         prefix = mobj.group('prefix')
 780         query = mobj.group('query')
 781         if prefix == '':
 782             return self._get_n_results(query, 1)
 783         elif prefix == 'all':
 784             return self._get_n_results(query, self._MAX_RESULTS)
 785         else:
 786             n = int(prefix)
 787             if n <= 0:
 788                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 789             elif n > self._MAX_RESULTS:
 790                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 791                 n = self._MAX_RESULTS
 792             return self._get_n_results(query, n)
 793
 794     def _get_n_results(self, query, n):
 795         """Get a specified number of results for a query"""
 796         raise NotImplementedError("This method must be implemented by subclasses")
 797
 798     @property
 799     def SEARCH_KEY(self):
 800         return self._SEARCH_KEY