_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Unique sentinel meaning "caller supplied no default"; compared by identity
# in InfoExtractor._search_regex so that None remains a usable default value.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
   60                                  Calculated from the format_id, width, height,
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * vcodec     Name of the video codec in use
  76                     * container  Name of the container format
  77                     * filesize   The number of bytes, if known in advance
  78                     * filesize_approx  An estimate for the number of bytes
  79                     * player_url SWF Player URL (used for rtmpdump).
  80                     * protocol   The protocol that will be used for the actual
  81                                  download, lower-case.
  82                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  83                     * preference Order number of this format. If this field is
  84                                  present and not None, the formats get sorted
  85                                  by this field, regardless of all other values.
  86                                  -1 for default (order by other properties),
  87                                  -2 or smaller for less than default.
  88                     * quality    Order number of the video quality of this
  89                                  format, irrespective of the file format.
  90                                  -1 for default (order by other properties),
  91                                  -2 or smaller for less than default.
  92                     * http_referer  HTTP Referer header value to set.
  93                     * http_method  HTTP method to use for the download.
  94                     * http_headers  A dictionary of additional HTTP headers
  95                                  to add to the request.
  96                     * http_post_data  Additional data to send with a POST
  97                                  request.
  98     url:            Final video URL.
  99     ext:            Video filename extension.
 100     format:         The video format, defaults to ext (used for --get-format)
 101     player_url:     SWF Player URL (used for rtmpdump).
 102
 103     The following fields are optional:
 104
 105     display_id      An alternative identifier for the video, not necessarily
 106                     unique, but available before title. Typically, id is
 107                     something like "4234987", title "Dancing naked mole rats",
 108                     and display_id "dancing-naked-mole-rats"
 109     thumbnails:     A list of dictionaries, with the following entries:
 110                         * "url"
 111                         * "width" (optional, int)
 112                         * "height" (optional, int)
  113                         * "resolution" (optional, string "{width}x{height}",
 114                                         deprecated)
 115     thumbnail:      Full URL to a video thumbnail image.
 116     description:    One-line video description.
 117     uploader:       Full name of the video uploader.
 118     timestamp:      UNIX timestamp of the moment the video became available.
 119     upload_date:    Video upload date (YYYYMMDD).
 120                     If not explicitly set, calculated from timestamp.
 121     uploader_id:    Nickname or id of the video uploader.
 122     location:       Physical location where the video was filmed.
 123     subtitles:      The subtitle file contents as a dictionary in the format
 124                     {language: subtitles}.
 125     duration:       Length of the video in seconds, as an integer.
 126     view_count:     How many users have watched the video on the platform.
 127     like_count:     Number of positive ratings of the video
 128     dislike_count:  Number of negative ratings of the video
 129     comment_count:  Number of comments on the video
 130     age_limit:      Age restriction for the video, as an integer (years)
 131     webpage_url:    The url to the video webpage, if given to youtube-dl it
 132                     should allow to get the same result again. (It will be set
 133                     by YoutubeDL if it's missing)
 134     categories:     A list of categories that the video falls in, for example
 135                     ["Sports", "Berlin"]
 136     is_live:        True, False, or None (=unknown). Whether this video is a
 137                     live stream that goes on instead of a fixed-length video.
 138
 139     Unless mentioned otherwise, the fields should be Unicode strings.
 140
 141     Subclasses of this one should re-define the _real_initialize() and
 142     _real_extract() methods and define a _VALID_URL regexp.
 143     Probably, they should also be added to the list of extractors.
 144
 145     Finally, the _WORKING attribute should be set to False for broken IEs
 146     in order to warn the users and skip the tests.
 147     """
 148
    # Becomes True once initialize() has run _real_initialize().
    _ready = False
    # Downloader object this extractor talks to; assigned by set_downloader().
    _downloader = None
    # Value returned by working(); set to False in subclasses for broken IEs.
    _WORKING = True
 152
 153     def __init__(self, downloader=None):
 154         """Constructor. Receives an optional downloader."""
 155         self._ready = False
 156         self.set_downloader(downloader)
 157
 158     @classmethod
 159     def suitable(cls, url):
 160         """Receives a URL and returns True if suitable for this IE."""
 161
 162         # This does not use has/getattr intentionally - we want to know whether
 163         # we have cached the regexp for *this* class, whereas getattr would also
 164         # match the superclass
 165         if '_VALID_URL_RE' not in cls.__dict__:
 166             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 167         return cls._VALID_URL_RE.match(url) is not None
 168
 169     @classmethod
 170     def _match_id(cls, url):
 171         if '_VALID_URL_RE' not in cls.__dict__:
 172             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 173         m = cls._VALID_URL_RE.match(url)
 174         assert m
 175         return m.group('id')
 176
 177     @classmethod
 178     def working(cls):
 179         """Getter method for _WORKING."""
 180         return cls._WORKING
 181
 182     def initialize(self):
 183         """Initializes an instance (authentication, etc)."""
 184         if not self._ready:
 185             self._real_initialize()
 186             self._ready = True
 187
 188     def extract(self, url):
 189         """Extracts URL information and returns it in list of dicts."""
 190         self.initialize()
 191         return self._real_extract(url)
 192
 193     def set_downloader(self, downloader):
 194         """Sets the downloader for this IE."""
 195         self._downloader = downloader
 196
 197     def _real_initialize(self):
 198         """Real initialization process. Redefine in subclasses."""
 199         pass
 200
 201     def _real_extract(self, url):
 202         """Real extraction process. Redefine in subclasses."""
 203         pass
 204
 205     @classmethod
 206     def ie_key(cls):
 207         """A string for getting the InfoExtractor with get_info_extractor"""
 208         return cls.__name__[:-2]
 209
 210     @property
 211     def IE_NAME(self):
 212         return type(self).__name__[:-2]
 213
 214     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 215         """ Returns the response handle """
 216         if note is None:
 217             self.report_download_webpage(video_id)
 218         elif note is not False:
 219             if video_id is None:
 220                 self.to_screen('%s' % (note,))
 221             else:
 222                 self.to_screen('%s: %s' % (video_id, note))
 223         try:
 224             return self._downloader.urlopen(url_or_request)
 225         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 226             if errnote is False:
 227                 return False
 228             if errnote is None:
 229                 errnote = 'Unable to download webpage'
 230             errmsg = '%s: %s' % (errnote, compat_str(err))
 231             if fatal:
 232                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 233             else:
 234                 self._downloader.report_warning(errmsg)
 235                 return False
 236
 237     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 238         """ Returns a tuple (page content as string, URL handle) """
 239
 240         # Strip hashes from the URL (#1038)
 241         if isinstance(url_or_request, (compat_str, str)):
 242             url_or_request = url_or_request.partition('#')[0]
 243
 244         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 245         if urlh is False:
 246             assert not fatal
 247             return False
 248         content_type = urlh.headers.get('Content-Type', '')
 249         webpage_bytes = urlh.read()
 250         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 251         if m:
 252             encoding = m.group(1)
 253         else:
 254             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 255                           webpage_bytes[:1024])
 256             if m:
 257                 encoding = m.group(1).decode('ascii')
 258             elif webpage_bytes.startswith(b'\xff\xfe'):
 259                 encoding = 'utf-16'
 260             else:
 261                 encoding = 'utf-8'
 262         if self._downloader.params.get('dump_intermediate_pages', False):
 263             try:
 264                 url = url_or_request.get_full_url()
 265             except AttributeError:
 266                 url = url_or_request
 267             self.to_screen('Dumping request to ' + url)
 268             dump = base64.b64encode(webpage_bytes).decode('ascii')
 269             self._downloader.to_screen(dump)
 270         if self._downloader.params.get('write_pages', False):
 271             try:
 272                 url = url_or_request.get_full_url()
 273             except AttributeError:
 274                 url = url_or_request
 275             basen = '%s_%s' % (video_id, url)
 276             if len(basen) > 240:
 277                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 278                 basen = basen[:240 - len(h)] + h
 279             raw_filename = basen + '.dump'
 280             filename = sanitize_filename(raw_filename, restricted=True)
 281             self.to_screen('Saving request to ' + filename)
 282             with open(filename, 'wb') as outf:
 283                 outf.write(webpage_bytes)
 284
 285         try:
 286             content = webpage_bytes.decode(encoding, 'replace')
 287         except LookupError:
 288             content = webpage_bytes.decode('utf-8', 'replace')
 289
 290         if ('<title>Access to this site is blocked</title>' in content and
 291                 'Websense' in content[:512]):
 292             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 293             blocked_iframe = self._html_search_regex(
 294                 r'<iframe src="([^"]+)"', content,
 295                 'Websense information URL', default=None)
 296             if blocked_iframe:
 297                 msg += ' Visit %s for more details' % blocked_iframe
 298             raise ExtractorError(msg, expected=True)
 299
 300         return (content, urlh)
 301
 302     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 303         """ Returns the data of the page as a string """
 304         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 305         if res is False:
 306             return res
 307         else:
 308             content, _ = res
 309             return content
 310
 311     def _download_xml(self, url_or_request, video_id,
 312                       note='Downloading XML', errnote='Unable to download XML',
 313                       transform_source=None, fatal=True):
 314         """Return the xml as an xml.etree.ElementTree.Element"""
 315         xml_string = self._download_webpage(
 316             url_or_request, video_id, note, errnote, fatal=fatal)
 317         if xml_string is False:
 318             return xml_string
 319         if transform_source:
 320             xml_string = transform_source(xml_string)
 321         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 322
 323     def _download_json(self, url_or_request, video_id,
 324                        note='Downloading JSON metadata',
 325                        errnote='Unable to download JSON metadata',
 326                        transform_source=None,
 327                        fatal=True):
 328         json_string = self._download_webpage(
 329             url_or_request, video_id, note, errnote, fatal=fatal)
 330         if (not fatal) and json_string is False:
 331             return None
 332         if transform_source:
 333             json_string = transform_source(json_string)
 334         try:
 335             return json.loads(json_string)
 336         except ValueError as ve:
 337             errmsg = '%s: Failed to parse JSON ' % video_id
 338             if fatal:
 339                 raise ExtractorError(errmsg, cause=ve)
 340             else:
 341                 self.report_warning(errmsg + str(ve))
 342
 343     def report_warning(self, msg, video_id=None):
 344         idstr = '' if video_id is None else '%s: ' % video_id
 345         self._downloader.report_warning(
 346             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 347
 348     def to_screen(self, msg):
 349         """Print msg to screen, prefixing it with '[ie_name]'"""
 350         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 351
 352     def report_extraction(self, id_or_name):
 353         """Report information extraction."""
 354         self.to_screen('%s: Extracting information' % id_or_name)
 355
 356     def report_download_webpage(self, video_id):
 357         """Report webpage download."""
 358         self.to_screen('%s: Downloading webpage' % video_id)
 359
 360     def report_age_confirmation(self):
 361         """Report attempt to confirm age."""
 362         self.to_screen('Confirming age')
 363
 364     def report_login(self):
 365         """Report attempt to log in."""
 366         self.to_screen('Logging in')
 367
 368     #Methods for following #608
 369     @staticmethod
 370     def url_result(url, ie=None, video_id=None):
 371         """Returns a url that points to a page that should be processed"""
 372         #TODO: ie should be the class used for getting the info
 373         video_info = {'_type': 'url',
 374                       'url': url,
 375                       'ie_key': ie}
 376         if video_id is not None:
 377             video_info['id'] = video_id
 378         return video_info
 379     @staticmethod
 380     def playlist_result(entries, playlist_id=None, playlist_title=None):
 381         """Returns a playlist"""
 382         video_info = {'_type': 'playlist',
 383                       'entries': entries}
 384         if playlist_id:
 385             video_info['id'] = playlist_id
 386         if playlist_title:
 387             video_info['title'] = playlist_title
 388         return video_info
 389
 390     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 391         """
 392         Perform a regex search on the given string, using a single or a list of
 393         patterns returning the first matching group.
 394         In case of failure return a default value or raise a WARNING or a
 395         RegexNotFoundError, depending on fatal, specifying the field name.
 396         """
 397         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 398             mobj = re.search(pattern, string, flags)
 399         else:
 400             for p in pattern:
 401                 mobj = re.search(p, string, flags)
 402                 if mobj:
 403                     break
 404
 405         if os.name != 'nt' and sys.stderr.isatty():
 406             _name = '\033[0;34m%s\033[0m' % name
 407         else:
 408             _name = name
 409
 410         if mobj:
 411             # return the first matching group
 412             return next(g for g in mobj.groups() if g is not None)
 413         elif default is not _NO_DEFAULT:
 414             return default
 415         elif fatal:
 416             raise RegexNotFoundError('Unable to extract %s' % _name)
 417         else:
 418             self._downloader.report_warning('unable to extract %s; '
 419                 'please report this issue on http://yt-dl.org/bug' % _name)
 420             return None
 421
 422     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 423         """
 424         Like _search_regex, but strips HTML tags and unescapes entities.
 425         """
 426         res = self._search_regex(pattern, string, name, default, fatal, flags)
 427         if res:
 428             return clean_html(res).strip()
 429         else:
 430             return res
 431
 432     def _get_login_info(self):
 433         """
 434         Get the the login info as (username, password)
 435         It will look in the netrc file using the _NETRC_MACHINE value
 436         If there's no info available, return (None, None)
 437         """
 438         if self._downloader is None:
 439             return (None, None)
 440
 441         username = None
 442         password = None
 443         downloader_params = self._downloader.params
 444
 445         # Attempt to use provided username and password or .netrc data
 446         if downloader_params.get('username', None) is not None:
 447             username = downloader_params['username']
 448             password = downloader_params['password']
 449         elif downloader_params.get('usenetrc', False):
 450             try:
 451                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 452                 if info is not None:
 453                     username = info[0]
 454                     password = info[2]
 455                 else:
 456                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 457             except (IOError, netrc.NetrcParseError) as err:
 458                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 459
 460         return (username, password)
 461
 462     def _get_tfa_info(self):
 463         """
 464         Get the two-factor authentication info
 465         TODO - asking the user will be required for sms/phone verify
 466         currently just uses the command line option
 467         If there's no info available, return None
 468         """
 469         if self._downloader is None:
 470             return None
 471         downloader_params = self._downloader.params
 472
 473         if downloader_params.get('twofactor', None) is not None:
 474             return downloader_params['twofactor']
 475
 476         return None
 477
 478     # Helper functions for extracting OpenGraph info
 479     @staticmethod
 480     def _og_regexes(prop):
 481         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 482         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 483         template = r'<meta[^>]+?%s[^>]+?%s'
 484         return [
 485             template % (property_re, content_re),
 486             template % (content_re, property_re),
 487         ]
 488
 489     def _og_search_property(self, prop, html, name=None, **kargs):
 490         if name is None:
 491             name = 'OpenGraph %s' % prop
 492         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 493         if escaped is None:
 494             return None
 495         return unescapeHTML(escaped)
 496
 497     def _og_search_thumbnail(self, html, **kargs):
 498         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 499
 500     def _og_search_description(self, html, **kargs):
 501         return self._og_search_property('description', html, fatal=False, **kargs)
 502
 503     def _og_search_title(self, html, **kargs):
 504         return self._og_search_property('title', html, **kargs)
 505
 506     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 507         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 508         if secure:
 509             regexes = self._og_regexes('video:secure_url') + regexes
 510         return self._html_search_regex(regexes, html, name, **kargs)
 511
 512     def _og_search_url(self, html, **kargs):
 513         return self._og_search_property('url', html, **kargs)
 514
 515     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 516         if display_name is None:
 517             display_name = name
 518         return self._html_search_regex(
 519             r'''(?ix)<meta
 520                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 521                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 522             html, display_name, fatal=fatal, **kwargs)
 523
 524     def _dc_search_uploader(self, html):
 525         return self._html_search_meta('dc.creator', html, 'uploader')
 526
 527     def _rta_search(self, html):
 528         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 529         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 530                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 531                      html):
 532             return 18
 533         return 0
 534
 535     def _media_rating_search(self, html):
 536         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 537         rating = self._html_search_meta('rating', html)
 538
 539         if not rating:
 540             return None
 541
 542         RATING_TABLE = {
 543             'safe for kids': 0,
 544             'general': 8,
 545             '14 years': 14,
 546             'mature': 17,
 547             'restricted': 19,
 548         }
 549         return RATING_TABLE.get(rating.lower(), None)
 550
 551     def _twitter_search_player(self, html):
 552         return self._html_search_meta('twitter:player', html,
 553             'twitter card player')
 554
 555     def _sort_formats(self, formats):
 556         if not formats:
 557             raise ExtractorError('No video formats found')
 558
 559         def _formats_key(f):
 560             # TODO remove the following workaround
 561             from ..utils import determine_ext
 562             if not f.get('ext') and 'url' in f:
 563                 f['ext'] = determine_ext(f['url'])
 564
 565             preference = f.get('preference')
 566             if preference is None:
 567                 proto = f.get('protocol')
 568                 if proto is None:
 569                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 570
 571                 preference = 0 if proto in ['http', 'https'] else -0.1
 572                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 573                     preference -= 0.5
 574
 575             if f.get('vcodec') == 'none':  # audio only
 576                 if self._downloader.params.get('prefer_free_formats'):
 577                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 578                 else:
 579                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 580                 ext_preference = 0
 581                 try:
 582                     audio_ext_preference = ORDER.index(f['ext'])
 583                 except ValueError:
 584                     audio_ext_preference = -1
 585             else:
 586                 if self._downloader.params.get('prefer_free_formats'):
 587                     ORDER = ['flv', 'mp4', 'webm']
 588                 else:
 589                     ORDER = ['webm', 'flv', 'mp4']
 590                 try:
 591                     ext_preference = ORDER.index(f['ext'])
 592                 except ValueError:
 593                     ext_preference = -1
 594                 audio_ext_preference = 0
 595
 596             return (
 597                 preference,
 598                 f.get('quality') if f.get('quality') is not None else -1,
 599                 f.get('height') if f.get('height') is not None else -1,
 600                 f.get('width') if f.get('width') is not None else -1,
 601                 ext_preference,
 602                 f.get('tbr') if f.get('tbr') is not None else -1,
 603                 f.get('vbr') if f.get('vbr') is not None else -1,
 604                 f.get('abr') if f.get('abr') is not None else -1,
 605                 audio_ext_preference,
 606                 f.get('filesize') if f.get('filesize') is not None else -1,
 607                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 608                 f.get('format_id'),
 609             )
 610         formats.sort(key=_formats_key)
 611
 612     def http_scheme(self):
 613         """ Either "https:" or "https:", depending on the user's preferences """
 614         return (
 615             'http:'
 616             if self._downloader.params.get('prefer_insecure', False)
 617             else 'https:')
 618
 619     def _proto_relative_url(self, url, scheme=None):
 620         if url is None:
 621             return url
 622         if url.startswith('//'):
 623             if scheme is None:
 624                 scheme = self.http_scheme()
 625             return scheme + url
 626         else:
 627             return url
 628
 629     def _sleep(self, timeout, video_id, msg_template=None):
 630         if msg_template is None:
 631             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 632         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 633         self.to_screen(msg)
 634         time.sleep(timeout)
 635
 636     def _extract_f4m_formats(self, manifest_url, video_id):
 637         manifest = self._download_xml(
 638             manifest_url, video_id, 'Downloading f4m manifest',
 639             'Unable to download f4m manifest')
 640
 641         formats = []
 642         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 643         for i, media_el in enumerate(media_nodes):
 644             tbr = int_or_none(media_el.attrib.get('bitrate'))
 645             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 646             formats.append({
 647                 'format_id': format_id,
 648                 'url': manifest_url,
 649                 'ext': 'flv',
 650                 'tbr': tbr,
 651                 'width': int_or_none(media_el.attrib.get('width')),
 652                 'height': int_or_none(media_el.attrib.get('height')),
 653             })
 654         self._sort_formats(formats)
 655
 656         return formats
 657
 658     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 659                               entry_protocol='m3u8', preference=None):
 660
 661         formats = [{
 662             'format_id': 'm3u8-meta',
 663             'url': m3u8_url,
 664             'ext': ext,
 665             'protocol': 'm3u8',
 666             'preference': -1,
 667             'resolution': 'multiple',
 668             'format_note': 'Quality selection URL',
 669         }]
 670
 671         format_url = lambda u: (
 672             u
 673             if re.match(r'^https?://', u)
 674             else compat_urlparse.urljoin(m3u8_url, u))
 675
 676         m3u8_doc = self._download_webpage(m3u8_url, video_id)
 677         last_info = None
 678         kv_rex = re.compile(
 679             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 680         for line in m3u8_doc.splitlines():
 681             if line.startswith('#EXT-X-STREAM-INF:'):
 682                 last_info = {}
 683                 for m in kv_rex.finditer(line):
 684                     v = m.group('val')
 685                     if v.startswith('"'):
 686                         v = v[1:-1]
 687                     last_info[m.group('key')] = v
 688             elif line.startswith('#') or not line.strip():
 689                 continue
 690             else:
 691                 if last_info is None:
 692                     formats.append({'url': format_url(line)})
 693                     continue
 694                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 695
 696                 f = {
 697                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 698                     'url': format_url(line.strip()),
 699                     'tbr': tbr,
 700                     'ext': ext,
 701                     'protocol': entry_protocol,
 702                     'preference': preference,
 703                 }
 704                 codecs = last_info.get('CODECS')
 705                 if codecs:
 706                     # TODO: looks like video codec is not always necessarily goes first
 707                     va_codecs = codecs.split(',')
 708                     if va_codecs[0]:
 709                         f['vcodec'] = va_codecs[0].partition('.')[0]
 710                     if len(va_codecs) > 1 and va_codecs[1]:
 711                         f['acodec'] = va_codecs[1].partition('.')[0]
 712                 resolution = last_info.get('RESOLUTION')
 713                 if resolution:
 714                     width_str, height_str = resolution.split('x')
 715                     f['width'] = int(width_str)
 716                     f['height'] = int(height_str)
 717                 formats.append(f)
 718                 last_info = {}
 719         self._sort_formats(formats)
 720         return formats
 721
 722     def _live_title(self, name):
 723         """ Generate the title for a live video """
 724         now = datetime.datetime.now()
 725         now_str = now.strftime("%Y-%m-%d %H:%M")
 726         return name + ' ' + now_str
 727
 728     def _int(self, v, name, fatal=False, **kwargs):
 729         res = int_or_none(v, **kwargs)
 730         if 'get_attr' in kwargs:
 731             print(getattr(v, kwargs['get_attr']))
 732         if res is None:
 733             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 734             if fatal:
 735                 raise ExtractorError(msg)
 736             else:
 737                 self._downloader.report_warning(msg)
 738         return res
 739
 740     def _float(self, v, name, fatal=False, **kwargs):
 741         res = float_or_none(v, **kwargs)
 742         if res is None:
 743             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 744             if fatal:
 745                 raise ExtractorError(msg)
 746             else:
 747                 self._downloader.report_warning(msg)
 748         return res
 749
 750
 751 class SearchInfoExtractor(InfoExtractor):
 752     """
 753     Base class for paged search queries extractors.
 754     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 755     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 756     """
 757
 758     @classmethod
 759     def _make_valid_url(cls):
 760         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 761
 762     @classmethod
 763     def suitable(cls, url):
 764         return re.match(cls._make_valid_url(), url) is not None
 765
 766     def _real_extract(self, query):
 767         mobj = re.match(self._make_valid_url(), query)
 768         if mobj is None:
 769             raise ExtractorError('Invalid search query "%s"' % query)
 770
 771         prefix = mobj.group('prefix')
 772         query = mobj.group('query')
 773         if prefix == '':
 774             return self._get_n_results(query, 1)
 775         elif prefix == 'all':
 776             return self._get_n_results(query, self._MAX_RESULTS)
 777         else:
 778             n = int(prefix)
 779             if n <= 0:
 780                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 781             elif n > self._MAX_RESULTS:
 782                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 783                 n = self._MAX_RESULTS
 784             return self._get_n_results(query, n)
 785
 786     def _get_n_results(self, query, n):
 787         """Get a specified number of results for a query"""
 788         raise NotImplementedError("This method must be implemented by subclasses")
 789
 790     @property
 791     def SEARCH_KEY(self):
 792         return self._SEARCH_KEY