X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fmtv.py;h=fedd5f46bba67eb5cb4304d38aa0bdffa537e11d;hb=a6211d237b4e7051ca018cc09440502561fedaa7;hp=00a980c7d8460d19d338d717a06012698b580507;hpb=adf063dad1792f0c9c680d13ccd984b4ad60ac29;p=youtube-dl diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 00a980c7d..fedd5f46b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -50,8 +51,7 @@ class MTVServicesInfoExtractor(InfoExtractor): thumb_node = itemdoc.find(search_path) if thumb_node is None: return None - else: - return thumb_node.attrib['url'] + return thumb_node.get('url') or thumb_node.text or None def _extract_mobile_video_formats(self, mtvn_id): webpage_url = self._MOBILE_TEMPLATE % mtvn_id @@ -83,7 +83,7 @@ class MTVServicesInfoExtractor(InfoExtractor): hls_url = rendition.find('./src').text formats.extend(self._extract_m3u8_formats( hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + m3u8_id='hls', fatal=False)) else: # fms try: @@ -106,7 +106,8 @@ class MTVServicesInfoExtractor(InfoExtractor): }]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') - self._sort_formats(formats) + if formats: + self._sort_formats(formats) return formats def _extract_subtitles(self, mdoc, mtvn_id): @@ -115,10 +116,17 @@ class MTVServicesInfoExtractor(InfoExtractor): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - subtitles[lang] = [{ - 'url': compat_str(typographic.get('src')), - 'ext': typographic.get('format') - } for typographic in transcript.findall('./typographic')] + for typographic in transcript.findall('./typographic'): + sub_src = typographic.get('src') + if not sub_src: + continue + ext = typographic.get('format') + if ext == 'cea-608': + ext = 'scc' + subtitles.setdefault(lang, []).append({ + 'url': compat_str(sub_src), + 'ext': ext + }) return subtitles def _get_video_info(self, itemdoc, use_hls=True): @@ -133,8 +141,11 @@ class MTVServicesInfoExtractor(InfoExtractor): mediagen_url += 'acceptMethods=' mediagen_url += 'hls' if use_hls else 'fms' - mediagen_doc = self._download_xml(mediagen_url, video_id, - 'Downloading video urls') + mediagen_doc = self._download_xml( + mediagen_url, video_id, 'Downloading video urls', fatal=False) + + if mediagen_doc is False: + return None item = mediagen_doc.find('./video/item') if item is not None and item.get('type') == 'text': @@ -174,6 +185,13 @@ class MTVServicesInfoExtractor(InfoExtractor): formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) + # Some parts of complete video may be missing (e.g. missing Act 3 in + # http://www.southpark.de/alle-episoden/s14e01-sexual-healing) + if not formats: + return None + + self._sort_formats(formats) + return { 'title': title, 'formats': formats, @@ -205,13 +223,18 @@ class MTVServicesInfoExtractor(InfoExtractor): title = xpath_text(idoc, './channel/title') description = xpath_text(idoc, './channel/description') + entries = [] + for item in idoc.findall('.//item'): + info = self._get_video_info(item, use_hls) + if info: + entries.append(info) + return self.playlist_result( - [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], - playlist_title=title, playlist_description=description) + entries, playlist_title=title, playlist_description=description) def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): triforce_feed = self._parse_json(self._search_regex( - r'triforceManifestFeed\s*=\s*(\{.+?\});\n', webpage, + r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage, 'triforce feed', default='{}'), video_id, fatal=False) data_zone = self._search_regex( @@ -243,7 +266,7 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid is None or ':' not in mgid: mgid = self._search_regex( - [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], + [r'data-mgid="(.*?)"', r'swfobject\.embedSWF\(".*?(mgid:.*?)"'], webpage, 'mgid', default=None) if not mgid: @@ -304,7 +327,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): class MTVIE(MTVServicesInfoExtractor): IE_NAME = 'mtv' - _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|full-episodes)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P[^/?#.]+)' _FEED_URL = 'http://www.mtv.com/feeds/mrss/' _TESTS = [{ @@ -321,9 +344,37 @@ class MTVIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101', 'only_matching': True, + }, { + 'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713', + 'only_matching': True, }] +class MTVJapanIE(MTVServicesInfoExtractor): + IE_NAME = 'mtvjapan' + _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P[0-9a-z]+)' + + _TEST = { + 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade', + 'info_dict': { + 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5', + 'ext': 'mp4', + 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition', + }, + 'params': { + 'skip_download': True, + }, + } + _GEO_COUNTRIES = ['JP'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtvjapan.com', + 'mgid': uri, + } + + class MTVVideoIE(MTVServicesInfoExtractor): IE_NAME = 'mtv:video' _VALID_URL = r'''(?x)^https?:// @@ -371,14 +422,14 @@ class MTVVideoIE(MTVServicesInfoExtractor): class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' - _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P\d+)-[^/#?]+/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P[0-9a-z]+)' _TESTS = [{ - 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum', 'info_dict': { - 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'flv', - 'title': 'MusicVideo_cro-traum', - 'description': 'Cro - Traum', + 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5', + 'ext': 'mp4', + 'title': 'Traum', + 'description': 'Traum', }, 'params': { # rtmp download @@ -387,11 +438,12 @@ class MTVDEIE(MTVServicesInfoExtractor): 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) - 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', + 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1', 'info_dict': { - 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'flv', - 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', + 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Teen Mom 2', + 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7', }, 'params': { # rtmp download @@ -399,7 +451,7 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Blocked at Travis CI', }, { - 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', + 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', 'ext': 'mp4', @@ -412,25 +464,11 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] + _GEO_COUNTRIES = ['DE'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist = self._parse_json( - self._search_regex( - r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), - video_id) - - def _mrss_url(item): - return item['mrss'] + item.get('mrssvars', '') - - # news pages contain single video in playlist with different id - if len(playlist) == 1: - return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) - - for item in playlist: - item_id = item.get('id') - if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(_mrss_url(item), video_id) + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.de', + 'mgid': uri, + }