From: Philipp Hagemeister Date: Sat, 10 Jan 2015 16:51:52 +0000 (+0100) Subject: Merge remote-tracking branch 'Dineshs91/f4m-2.0' X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=121c09c7be1ac2944f3432122104c1952bfd1f04;hp=-c Merge remote-tracking branch 'Dineshs91/f4m-2.0' --- 121c09c7be1ac2944f3432122104c1952bfd1f04 diff --combined youtube_dl/extractor/__init__.py index b908eb38c,7918f9ab7..fc83a7d07 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@@ -1,13 -1,8 +1,13 @@@ +from __future__ import unicode_literals + from .abc import ABCIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE +from .adobetv import AdobeTVIE from .adultswim import AdultSwimIE from .aftonbladet import AftonbladetIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE from .anitube import AnitubeIE from .anysex import AnySexIE from .aol import AolIE @@@ -25,27 -20,19 +25,27 @@@ from .arte import ArteTVDDCIE, ArteTVEmbedIE, ) +from .atresplayer import AtresPlayerIE +from .atttechchannel import ATTTechChannelIE +from .audiomack import AudiomackIE, AudiomackAlbumIE from .auengine import AUEngineIE +from .azubu import AzubuIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bet import BetIE +from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE +from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE from .canal13cl import Canal13clIE @@@ -56,7 -43,7 +56,7 @@@ from .cbsnews import CBSNewsI from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .cinemassacre import CinemassacreIE +from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE @@@ -67,20 -54,15 +67,20 @@@ from .cnet import CNETI from .cnn import ( CNNIE, CNNBlogsIE, + CNNArticleIE, ) from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comcarcoff import ComCarCoffIE +from .commonmistakes import CommonMistakesIE from .condenast import CondeNastIE from .cracked import CrackedIE from .criterion import CriterionIE -from .crunchyroll import CrunchyrollIE +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE +) from .cspan import CSpanIE -from .d8 import D8IE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, @@@ -94,14 -76,12 +94,14 @@@ from .dotsub import DotsubI from .dreisat import DreiSatIE from .drtuber import DrTuberIE from .drtv import DRTVIE +from .dvtv import DVTVIE from .dump import DumpIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE from .dropbox import DropboxIE from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE @@@ -114,7 -94,6 +114,7 @@@ from .elpais import ElPaisI from .empflix import EMPFlixIE from .engadget import EngadgetIE from .eporner import EpornerIE +from .eroprofile import EroProfileIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE @@@ -132,10 -111,7 +132,10 @@@ from .fktv import FKTVPosteckeIE, ) from .flickr import FlickrIE +from .folketinget import FolketingetIE from .fourtube import FourTubeIE +from .foxgay import FoxgayIE +from .foxnews import FoxNewsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( @@@ -147,7 -123,6 +147,7 @@@ ) from .freesound import FreesoundIE from .freespeech import FreespeechIE +from .freevideo import FreeVideoIE from .funnyordie import FunnyOrDieIE from .gamekings import GamekingsIE from .gameone import ( @@@ -159,25 -134,18 +159,25 @@@ from .gamestar import GameStarI from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE +from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE +from .groupon import GrouponIE from .hark import HarkIE from .heise import HeiseIE +from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE @@@ -215,7 -183,6 +215,7 @@@ from .kontrtube import KontrTubeI from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE +from .laola1tv import Laola1TvIE from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import ( @@@ -236,7 -203,6 +236,7 @@@ from .mdr import MDRI from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE +from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE @@@ -263,10 -229,9 +263,10 @@@ from .muenchentv import MuenchenTVI from .musicplayon import MusicPlayOnIE from .musicvault import MusicVaultIE from .muzu import MuzuTVIE -from .myspace import MySpaceIE +from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvideo import MyVideoIE +from .myvidster import MyVidsterIE from .naver import NaverIE from .nba import NBAIE from .nbc import ( @@@ -275,14 -240,12 +275,14 @@@ ) from .ndr import NDRIE from .ndtv import NDTVIE +from .netzkino import NetzkinoIE +from .nerdcubed import NerdCubedFeedIE from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import NHLIE, NHLVideocenterIE -from .niconico import NiconicoIE +from .niconico import NiconicoIE, NiconicoPlaylistIE from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE @@@ -303,7 -266,6 +303,7 @@@ from .nytimes import NYTimesI from .nuvid import NuvidIE from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE +from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, @@@ -312,7 -274,6 +312,7 @@@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .planetaplay import PlanetaPlayIE from .played import PlayedIE @@@ -326,30 -287,25 +326,31 @@@ from .pornoxo import PornoXOI from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE +from .quickvid import QuickVidIE +from .radiode import RadioDeIE +from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE +from .restudy import RestudyIE from .reverbnation import ReverbNationIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE + from .rte import RteIE from .rtlnl import RtlXlIE from .rtlnow import RTLnowIE +from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, RutubeChannelIE, + RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, ) @@@ -359,10 -315,7 +360,10 @@@ from .savefrom import SaveFromI from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE +from .screencastomatic import ScreencastOMaticIE +from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE from .servingsys import ServingSysIE +from .sexu import SexuIE from .sexykarma import SexyKarmaIE from .shared import SharedIE from .sharesix import ShareSixIE @@@ -397,7 -350,6 +398,7 @@@ from .spike import SpikeI from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE @@@ -408,7 -360,6 +409,7 @@@ from .syfy import SyfyI from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE from .tapely import TapelyIE +from .tass import TassIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@@ -417,10 -368,8 +418,10 @@@ from .teachingchannel import TeachingCh from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telemb import TeleMBIE +from .teletask import TeleTaskIE from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE @@@ -430,7 -379,6 +431,7 @@@ from .thesixtyone import TheSixtyOneI from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE +from .tmz import TMZIE from .tnaflix import TNAFlixIE from .thvideo import ( THVideoIE, @@@ -444,13 -392,11 +445,13 @@@ from .trutube import TruTubeI from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE +from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE from .tvigle import TvigleIE -from .tvp import TvpIE +from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE +from .twentyfourvideo import TwentyFourVideoIE from .twitch import TwitchIE from .ubu import UbuIE from .udemy import ( @@@ -467,7 -413,6 +468,7 @@@ from .vesti import VestiI from .vevo import VevoIE from .vgtv import VGTVIE from .vh1 import VH1IE +from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE @@@ -478,8 -423,6 +479,8 @@@ from .videopremium import VideoPremiumI from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vidme import VidmeIE +from .vidzi import VidziIE +from .vier import VierIE, VierVideosIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@@ -496,13 -439,9 +497,13 @@@ from .vine import VineUserIE, ) from .viki import VikiIE -from .vk import VKIE +from .vk import ( + VKIE, + VKUserVideosIE, +) from .vodlocker import VodlockerIE from .vporn import VpornIE +from .vrt import VRTIE from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE @@@ -515,7 -454,6 +516,7 @@@ from .wdr import WDRMobileIE, WDRMausIE, ) +from .webofstories import WebOfStoriesIE from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE @@@ -524,16 -462,13 +525,16 @@@ from .wrzuta import WrzutaI from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xhamster import XHamsterIE +from .xminus import XMinusIE from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeUserIE, XTubeIE +from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, YahooSearchIE, ) +from .yesjapan import YesJapanIE from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE @@@ -551,16 -486,14 +552,16 @@@ from .youtube import YoutubeSearchURLIE, YoutubeShowIE, YoutubeSubscriptionsIE, - YoutubeTopListIE, + YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, YoutubeWatchLaterIE, ) - -from .zdf import ZDFIE - +from .zdf import ZDFIE, ZDFChannelIE +from .zingmp3 import ( + ZingMp3SongIE, + ZingMp3AlbumIE, +) _ALL_CLASSES = [ klass @@@ -577,17 -510,6 +578,17 @@@ def gen_extractors() return [klass() for klass in _ALL_CLASSES] +def list_extractors(age_limit): + """ + Return a list of extractors that are suitable for the given age, + sorted by extractor ID. + """ + + return sorted( + filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), + key=lambda ie: ie.IE_NAME.lower()) + + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - return globals()[ie_name+'IE'] + return globals()[ie_name + 'IE'] diff --combined youtube_dl/extractor/common.py index 363e2000c,a8674dd85..03f3f18c8 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@@ -12,16 -12,13 +12,16 @@@ import sy import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( + compat_cookiejar, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, compat_str, - +) +from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, @@@ -41,15 -38,11 +41,15 @@@ class InfoExtractor(object) information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this + passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. - The dictionaries must include the following fields: + The type field determines the the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. @@@ -79,7 -72,6 +79,7 @@@ * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s + * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance @@@ -93,30 -85,16 +93,30 @@@ by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language_preference Is this in the correct requested + language? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. * http_referer HTTP Referer header value to set. * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. * http_post_data Additional data to send with a POST request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@@ -124,7 -102,6 +124,7 @@@ The following fields are optional: + alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", @@@ -136,7 -113,7 +136,7 @@@ * "resolution" (optional, string "{width}x{height"}, deprecated) thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + description: Full video description. uploader: Full name of the video uploader. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). @@@ -150,17 -127,6 +150,17 @@@ like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set @@@ -174,39 -140,6 +174,39 @@@ Unless mentioned otherwise, None is equivalent to absence of information. + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "title" and "id" attributes with the same + semantics as videos (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for examples multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@@ -305,6 -238,7 +305,6 @@@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@@ -313,14 -247,8 +313,14 @@@ if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@@ -377,21 -305,11 +377,21 @@@ msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) - return (content, urlh) + return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): """ Returns the data of the page as a string """ - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = False + try_count = 0 + while success is False: + try: + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = True + except compat_http_client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) if res is False: return res else: @@@ -419,10 -337,6 +419,10 @@@ url_or_request, video_id, note, errnote, fatal=fatal) if (not fatal) and json_string is False: return None + return self._parse_json( + json_string, video_id, transform_source=transform_source, fatal=fatal) + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: json_string = transform_source(json_string) try: @@@ -459,20 -373,19 +459,20 @@@ """Report attempt to log in.""" self.to_screen('Logging in') - #Methods for following #608 + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" - #TODO: ie should be the class used for getting the info + # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} if video_id is not None: video_info['id'] = video_id return video_info + @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} @@@ -480,11 -393,9 +480,11 @@@ video_info['id'] = playlist_id if playlist_title: video_info['title'] = playlist_title + if playlist_description: + video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@@ -505,25 -416,22 +505,25 @@@ _name = name if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + else: + return mobj.group(group) elif default is not _NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + 'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ - res = self._search_regex(pattern, string, name, default, fatal, flags) + res = self._search_regex(pattern, string, name, default, fatal, flags, group) if res: return clean_html(res).strip() else: @@@ -556,7 -464,7 +556,7 @@@ raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) - + return (username, password) def _get_tfa_info(self): @@@ -616,10 -524,10 +616,10 @@@ if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)]+(?:itemprop|name|property)=["\']?%s["\']?) - [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal, **kwargs) + r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name), + html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') @@@ -650,7 -558,7 +650,7 @@@ def _twitter_search_player(self, html): return self._html_search_meta('twitter:player', html, - 'twitter card player') + 'twitter card player') def _sort_formats(self, formats): if not formats: @@@ -695,7 -603,6 +695,7 @@@ return ( preference, + f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, @@@ -704,16 -611,14 +704,16 @@@ f.get('vbr') if f.get('vbr') is not None else -1, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, + f.get('fps') if f.get('fps') is not None else -1, f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, + f.get('source_preference') if f.get('source_preference') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) def http_scheme(self): - """ Either "https:" or "https:", depending on the user's preferences """ + """ Either "http:" or "https:", depending on the user's preferences """ return ( 'http:' if self._downloader.params.get('prefer_insecure', False) @@@ -742,8 -647,14 +742,14 @@@ 'Unable to download f4m manifest') formats = [] + manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') + if not media_nodes: + manifest_version = '2.0' + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): + if manifest_version == '2.0': + manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href') tbr = int_or_none(media_el.attrib.get('bitrate')) format_id = 'f4m-%d' % (i if tbr is None else tbr) formats.append({ @@@ -776,10 -687,7 +782,10 @@@ if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage(m3u8_url, video_id) + m3u8_doc = self._download_webpage( + m3u8_url, video_id, + note='Downloading m3u8 information', + errnote='Failed to download m3u8 information') last_info = None kv_rex = re.compile( r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') @@@ -825,49 -733,6 +831,49 @@@ self._sort_formats(formats) return formats + # TODO: improve extraction + def _extract_smil_formats(self, smil_url, video_id): + smil = self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file') + + base = smil.find('./head/meta').get('base') + + formats = [] + rtmp_count = 0 + for video in smil.findall('./body/switch/video'): + src = video.get('src') + if not src: + continue + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + if not proto: + if base: + if base.startswith('rtmp'): + proto = 'rtmp' + elif base.startswith('http'): + proto = 'http' + ext = video.get('ext') + if proto == 'm3u8': + formats.extend(self._extract_m3u8_formats(src, video_id, ext)) + elif proto == 'rtmp': + rtmp_count += 1 + streamer = video.get('streamer') or base + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + self._sort_formats(formats) + + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@@ -896,41 -761,6 +902,41 @@@ self._downloader.report_warning(msg) return res + def _set_cookie(self, domain, name, value, expire_time=None): + cookie = compat_cookiejar.Cookie( + 0, name, value, None, None, domain, None, + None, '/', True, False, expire_time, '', None, None, None) + self._downloader.cookiejar.set_cookie(cookie) + + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + class SearchInfoExtractor(InfoExtractor): """