X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=76852f9dc82b046bf319d075ae7ca67c58b7d205;hb=54fc90aabfb71968f28af68dfe3f7a3544cc2f0b;hp=e3cb5c5ce51de5d5fc4aac16c64443b60e91b478;hpb=069937151e429a2127569910d204c03eec167f0d;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e3cb5c5ce..76852f9dc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -23,6 +23,7 @@ from ..utils import ( is_html, js_to_json, KNOWN_EXTENSIONS, + merge_dicts, mimetype2ext, orderedSet, sanitized_Request, @@ -106,6 +107,7 @@ from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE class GenericIE(InfoExtractor): @@ -190,6 +192,16 @@ class GenericIE(InfoExtractor): 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', + }, + 'playlist_mincount': 100, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -1220,7 +1232,7 @@ class GenericIE(InfoExtractor): 'title': '35871', 'timestamp': 1355743100, 'upload_date': '20121217', - 'uploader_id': 'batchUser', + 'uploader_id': 'cplapp@learn360.com', }, 'add_ie': ['Kaltura'], }, @@ -1271,6 +1283,39 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura iframe embed, more sophisticated + 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', + 'info_dict': { + 'id': '1_9gzouybz', + 'ext': 'mp4', + 'title': 'lecture-05sep2017', + 'description': 'md5:40f347d91fd4ba047e511c5321064b49', + 'upload_date': '20170913', + 'uploader_id': 'eps2', + 'timestamp': 1505340777, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, + { + # meta twitter:player + 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', + 'info_dict': { + 'id': '0_01b42zps', + 'ext': 'mp4', + 'title': 'Main Twerk (Video)', + 'upload_date': '20171208', + 'uploader_id': 'sebastian.salinas@thechive.com', + 'timestamp': 1512713057, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, # referrer protected EaglePlatform embed { 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', @@ -1427,21 +1472,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Ooyala embed - { - 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', - 'info_dict': { - 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', - 'ext': 'mp4', - 'description': 'Index/Match versus VLOOKUP.', - 'title': 'This is what separates the Excel masters from the wannabes', - 'duration': 191.933, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - } - }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', @@ -1969,6 +1999,19 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # CloudflareStream embed + 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'add_ie': [CloudflareStreamIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -2009,13 +2052,15 @@ class GenericIE(InfoExtractor): entries = [] for it in doc.findall('./channel/item'): - next_url = xpath_text(it, 'link', fatal=False) + next_url = None + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + if not next_url: - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break + next_url = xpath_text(it, 'link', fatal=False) if not next_url: continue @@ -2979,6 +3024,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) + if cloudflarestream_urls: + return self.playlist_from_matches( + cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] @@ -2986,21 +3036,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) - def merge_dicts(dict1, dict2): - merged = {} - for k, v in dict1.items(): - if v is not None: - merged[k] = v - for k, v in dict2.items(): - if v is None: - continue - if (k not in merged or - (isinstance(v, compat_str) and v and - isinstance(merged[k], compat_str) and - not merged[k])): - merged[k] = v - return merged - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: