From: Sergey M․ Date: Fri, 13 Nov 2015 19:23:15 +0000 (+0600) Subject: Merge branch 'brightcove_in_page_embed' of https://github.com/remitamine/youtube... X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=a2973eb59733c5f86a249c627d654b789020bc7d;hp=-c Merge branch 'brightcove_in_page_embed' of https://github.com/remitamine/youtube-dl into remitamine-brightcove_in_page_embed --- a2973eb59733c5f86a249c627d654b789020bc7d diff --combined youtube_dl/extractor/__init__.py index 06d25ef40,fcd9edec3..08cb93d76 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@@ -45,7 -45,6 +45,7 @@@ from .bambuser import BambuserIE, Bambu from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, + BBCCoUkArticleIE, BBCIE, ) from .beeg import BeegIE @@@ -60,7 -59,10 +60,10 @@@ from .bloomberg import BloombergI from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE - from .brightcove import BrightcoveIE + from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, + ) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE @@@ -77,7 -79,6 +80,7 @@@ from .cbssports import CBSSportsI from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( ChirbitIE, @@@ -90,7 -91,6 +93,7 @@@ from .cliphunter import CliphunterI from .clipsyndicate import ClipsyndicateIE from .cloudy import CloudyIE from .clubic import ClubicIE +from .clyp import ClypIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( @@@ -124,7 -124,6 +127,7 @@@ from .dbtv import DBTVI from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE @@@ -142,6 -141,7 +145,6 @@@ from .dump import DumpI from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE -from .divxstage import DivxStageIE from .dropbox import DropboxIE from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE @@@ -162,7 -162,6 +165,7 @@@ from .eroprofile import EroProfileI from .escapist import EscapistIE from .espn import ESPNIE from .esri import EsriVideoIE +from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@@ -170,12 -169,14 +173,12 @@@ from .extremetube import ExtremeTubeI from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE -from .fktv import ( - FKTVIE, - FKTVPosteckeIE, -) +from .fktv import FKTVIE from .flickr import FlickrIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE @@@ -212,15 -213,13 +215,15 @@@ from .gfycat import GfycatI from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE -from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .groupon import GrouponIE from .hark import HarkIE @@@ -233,6 -232,7 +236,6 @@@ from .historicfilms import HistoricFilm from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE -from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE @@@ -302,11 -302,6 +305,11 @@@ from .lifenews import LifeNewsIE, LifeEmbedIE, ) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@@ -324,6 -319,7 +327,6 @@@ from .macgamestore import MacGameStoreI from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE -from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@@ -374,9 -370,6 +377,9 @@@ from .nbc import from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, ) from .ndtv import NDTVIE from .netzkino import NetzkinoIE @@@ -412,11 -405,7 +415,11 @@@ from .normalboots import NormalbootsI from .nosvideo import NosVideoIE from .nova import NovaIE from .novamov import NovaMovIE -from .nowness import NownessIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( @@@ -446,6 -435,7 +449,6 @@@ from .ooyala import OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, @@@ -591,7 -581,6 +594,7 @@@ from .spankwire import SpankwireI from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import ( SportBoxIE, @@@ -696,7 -685,7 +699,7 @@@ from .twitch import TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, @@@ -723,6 -712,7 +726,6 @@@ from .vh1 import VH1I from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@@ -732,7 -722,6 +735,7 @@@ from .vidme import VidmeI from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@@ -785,7 -774,6 +788,7 @@@ from .wrzuta import WrzutaI from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE from .xhamster import ( XHamsterIE, XHamsterEmbedIE, diff --combined youtube_dl/extractor/brightcove.py index 1686cdde1,c6ad1d065..2c7d968a8 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@@ -3,10 -3,10 +3,10 @@@ from __future__ import unicode_literal import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, @@@ -22,6 -22,10 +22,10 @@@ from ..utils import fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@@ -119,7 -123,7 +123,7 @@@ class BrightcoveIE(InfoExtractor) object_str = fix_xml_ampersands(object_str) try: - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return @@@ -346,3 -350,94 +350,94 @@@ if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + + class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + _TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'mp4', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', + 'duration': 165768, + 'uploader_id': '929656772001', + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('thumbnail') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') or source.get('streaming_src') + if src: + formats.append({ + 'url': src, + 'tbr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'uploader_id': account_id, + } diff --combined youtube_dl/extractor/generic.py index d0b486d2a,7a3a7f66b..34d930a2d --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@@ -4,12 -4,10 +4,12 @@@ from __future__ import unicode_literal import os import re +import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@@ -22,6 -20,7 +22,6 @@@ from ..utils import HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@@ -30,7 -29,10 +30,10 @@@ url_basename, xpath_text, ) - from .brightcove import BrightcoveIE + from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, + ) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@@ -50,7 -52,6 +53,7 @@@ from .dailymotion import DailymotionClo from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE +from .mtv import MTVServicesEmbeddedIE class GenericIE(InfoExtractor): @@@ -141,7 -142,6 +144,7 @@@ 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, @@@ -233,22 -233,6 +236,22 @@@ 'skip_download': False, } }, + { + # redirect in Refresh HTTP header + 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@@ -1031,6 -1015,17 +1034,17 @@@ 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@@ -1238,7 -1233,7 +1252,7 @@@ # Is it an RSS feed, a SMIL file or a XSPF playlist? try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@@ -1307,6 -1302,11 +1321,11 @@@ 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@@ -1613,9 -1613,12 +1632,9 @@@ return self.url_result(url, ie='Vulture') # Look for embedded mtvservices player - mobj = re.search( - r'