"""Extractors for sites served by the MTV Networks media services.

``MTVServicesInfoExtractor`` holds the shared feed/mediagen logic;
``MTVIE`` (mtv.com) and ``MTVIggyIE`` (mtviggy.com) only supply the feed URL
and, where needed, site-specific URL handling.
"""
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_urllib_parse,
    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
    url_basename,
    RegexNotFoundError,
)


def _media_xml_tag(tag):
    """Return *tag* qualified with the Yahoo Media RSS XML namespace."""
    return '{http://search.yahoo.com/mrss/}%s' % tag


class MTVServicesInfoExtractor(InfoExtractor):
    """Base class for extractors backed by an MTV services RSS feed.

    Subclasses must define ``_FEED_URL``; it is queried with the video's
    ``uri`` by ``_get_videos_info``.
    """

    @staticmethod
    def _id_from_uri(uri):
        """Return the last ``:``-separated component of *uri*."""
        return uri.split(':')[-1]

    # This was originally implemented for ComedyCentral, but it also works here
    @staticmethod
    def _transform_rtmp_url(rtmp_video_url):
        """Map an rtmp(e) URL onto the HTTP mirror, when it has a ``gsp.`` path.

        URLs that do not match the expected ``rtmp[e]://.../gsp.<...>/<...>``
        layout are returned unchanged.
        """
        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
        if not m:
            return rtmp_video_url
        base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
        return base + m.group('finalid')

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the ``media:group/media:thumbnail`` URL of *itemdoc*, or None.

        *uri* is unused here; it is part of the signature so that subclasses
        can build the thumbnail URL from it instead.
        """
        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
        thumb_node = itemdoc.find(search_path)
        if thumb_node is None:
            return None
        return thumb_node.attrib['url']

    def _extract_video_formats(self, mdoc):
        """Build the list of format dicts from the mediagen document *mdoc*.

        Raises:
            ExtractorError: if the document points at the country-block
                placeholder, or if a rendition lacks a required field or
                carries a non-numeric width/height.
        """
        # Element.find() returns None when nothing matches, and .text may be
        # None for an empty element; only test the URL when there is one.
        src_node = mdoc.find('.//src')
        if (src_node is not None and src_node.text is not None and
                re.match(r'.*/error_country_block\.swf$', src_node.text) is not None):
            raise ExtractorError('This video is not available from your country.', expected=True)

        formats = []
        for rendition in mdoc.findall('.//rendition'):
            try:
                _, _, ext = rendition.attrib['type'].partition('/')
                rtmp_video_url = rendition.find('./src').text
                formats.append({
                    'ext': ext,
                    'url': self._transform_rtmp_url(rtmp_video_url),
                    'format_id': rendition.get('bitrate'),
                    'width': int(rendition.get('width')),
                    'height': int(rendition.get('height')),
                })
            except (AttributeError, KeyError, TypeError, ValueError):
                # AttributeError: no <src> child; KeyError: no "type";
                # TypeError: missing width/height or empty <src>;
                # ValueError: width/height present but not an integer.
                raise ExtractorError('Invalid rendition field.')
        return formats

    def _get_video_info(self, itemdoc):
        """Return the info dict for one feed ``<item>`` element.

        Downloads the item's mediagen document to obtain the formats.

        Raises:
            ExtractorError: if no title text can be found in the item.
        """
        uri = itemdoc.find('guid').text
        video_id = self._id_from_uri(uri)
        self.report_extraction(video_id)

        content_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))
        mediagen_url = itemdoc.find(content_path).attrib['url']
        # Remove the templates, like &device={device}
        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
        if 'acceptMethods' not in mediagen_url:
            mediagen_url += '&acceptMethods=fms'

        mediagen_doc = self._download_xml(
            mediagen_url, video_id, 'Downloading video urls')

        description_node = itemdoc.find('description')
        if description_node is not None and description_node.text is not None:
            description = description_node.text.strip()
        else:
            description = None

        # Title sources, in order of preference; fall through to the next one
        # whenever an element is missing or has no text.
        title_el = find_xpath_attr(
            itemdoc, './/{http://search.yahoo.com/mrss/}category',
            'scheme', 'urn:mtvn:video_title')
        if title_el is None or title_el.text is None:
            title_el = itemdoc.find('.//title')
        if title_el is None or title_el.text is None:
            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
        if title_el is None or title_el.text is None:
            raise ExtractorError('Could not find video title')
        title = title_el.text.strip()

        return {
            'title': title,
            'formats': self._extract_video_formats(mediagen_doc),
            'id': video_id,
            'thumbnail': self._get_thumbnail_url(uri, itemdoc),
            'description': description,
        }

    def _get_videos_info(self, uri):
        """Download the feed for *uri* and return one info dict per ``<item>``."""
        video_id = self._id_from_uri(uri)
        data = compat_urllib_parse.urlencode({'uri': uri})

        idoc = self._download_xml(
            self._FEED_URL + '?' + data, video_id,
            'Downloading info', transform_source=fix_xml_ampersands)
        return [self._get_video_info(item) for item in idoc.findall('.//item')]

    def _real_extract(self, url):
        """Find the mgid embedded in the page at *url* and extract its videos."""
        title = url_basename(url)
        webpage = self._download_webpage(url, title)
        try:
            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
            # or http://media.mtvnservices.com/{mgid}
            og_url = self._og_search_video_url(webpage)
        except RegexNotFoundError:
            mgid = self._search_regex(
                [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
                webpage, 'mgid')
        else:
            mgid = url_basename(og_url)
            if mgid.endswith('.swf'):
                mgid = mgid[:-4]
        return self._get_videos_info(mgid)


class MTVIE(MTVServicesInfoExtractor):
    """Extractor for mtv.com video pages and m.mtv.com ``video.rbml`` links."""

    # Desktop URLs carry a numeric "videoid"; mobile URLs carry the "mgid"
    # directly in the query string, so only one of the two groups is set.
    _VALID_URL = r'''(?x)^https?://
        (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
           m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''

    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'

    _TESTS = [
        {
            'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
            'file': '853555.mp4',
            'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
            'info_dict': {
                'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
                'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
            },
        },
        {
            'add_ie': ['Vevo'],
            'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
            'file': 'USCJY1331283.mp4',
            'md5': '73b4e7fcadd88929292fe52c3ced8caf',
            'info_dict': {
                'title': 'Everything Has Changed',
                'upload_date': '20130606',
                'uploader': 'Taylor Swift',
            },
            'skip': 'VEVO is only available in some countries',
        },
    ]

    def _get_thumbnail_url(self, uri, itemdoc):
        """Build the thumbnail URL from *uri*; *itemdoc* is not consulted."""
        return 'http://mtv.mtvnimages.com/uri/' + uri

    def _real_extract(self, url):
        """Resolve *url* to a feed uri (or hand off to Vevo) and extract it."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoid')
        uri = mobj.groupdict().get('mgid')
        if uri is None:
            webpage = self._download_webpage(url, video_id)

            # Some videos come from Vevo.com
            m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
                               webpage, re.DOTALL)
            if m_vevo:
                vevo_id = m_vevo.group(1)
                self.to_screen('Vevo video detected: %s' % vevo_id)
                return self.url_result('vevo:%s' % vevo_id, ie='Vevo')

            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
        return self._get_videos_info(uri)


class MTVIggyIE(MTVServicesInfoExtractor):
    """Extractor for mtviggy.com; uses the inherited page-based extraction."""

    IE_NAME = 'mtviggy.com'
    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'
    _TEST = {
        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
        'info_dict': {
            'id': '984696',
            'ext': 'mp4',
            'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet',
        }
    }
    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'