X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fmtv.py;h=640ee3d9339c48e2b3fef0ade15ee8ebcae8b292;hb=779822d945dc7ebba7062ac9a5e760d21a7f362a;hp=4e4358151e9c041f3c608eeaf5643a5afb90153a;hpb=054d43bb11590da234e7e8f46a595a34b19efb8c;p=youtube-dl diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 4e4358151..640ee3d93 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,18 +4,20 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, - compat_urllib_request, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + float_or_none, HEADRequest, + sanitized_Request, unescapeHTML, url_basename, RegexNotFoundError, + xpath_text, ) @@ -53,7 +55,7 @@ class MTVServicesInfoExtractor(InfoExtractor): def _extract_mobile_video_formats(self, mtvn_id): webpage_url = self._MOBILE_TEMPLATE % mtvn_id - req = compat_urllib_request.Request(webpage_url) + req = sanitized_Request(webpage_url) # Otherwise we get a webpage that would execute some javascript req.add_header('User-Agent', 'curl/7') webpage = self._download_webpage(req, mtvn_id, @@ -110,7 +112,8 @@ class MTVServicesInfoExtractor(InfoExtractor): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) - mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) + mediagen_url = content_el.attrib['url'] # Remove the templates, like &device={device} mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: @@ -128,11 +131,7 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description_node = itemdoc.find('description') - if description_node is not None: - description = description_node.text.strip() - else: - description = None + description = xpath_text(itemdoc, 'description') title_el = None if title_el is None: @@ -142,7 +141,7 @@ class MTVServicesInfoExtractor(InfoExtractor): if title_el is None: title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') if title_el is None: - title_el = itemdoc.find('.//title') + title_el = itemdoc.find('.//title') or itemdoc.find('./title') if title_el.text is None: title_el = None @@ -165,16 +164,19 @@ class MTVServicesInfoExtractor(InfoExtractor): 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, + 'duration': float_or_none(content_el.attrib.get('duration')), } + def _get_feed_query(self, uri): + data = {'uri': uri} + if self._LANG: + data['lang'] = self._LANG + return compat_urllib_parse_urlencode(data) + def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - data = compat_urllib_parse.urlencode({'uri': uri}) - info_url = feed_url + '?' - if self._LANG: - info_url += 'lang=%s&' % self._LANG - info_url += data + info_url = feed_url + '?' + self._get_feed_query(uri) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): @@ -184,9 +186,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return self.playlist_result( [self._get_video_info(item) for item in idoc.findall('.//item')]) - def _real_extract(self, url): - title = url_basename(url) - webpage = self._download_webpage(url, title) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -200,8 +200,19 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid is None or ':' not in mgid: mgid = self._search_regex( [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], - webpage, 'mgid') + webpage, 'mgid', default=None) + if not mgid: + sm4_embed = self._html_search_meta( + 'sm4:video:embed', webpage, 'sm4 embed', default='') + mgid = self._search_regex( + r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid') + return mgid + + def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + mgid = self._extract_mgid(webpage) videos_info = self._get_videos_info(mgid) return videos_info @@ -222,6 +233,13 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): }, } + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + if mobj: + return mobj.group('url') + def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) site_id = uri.replace(video_id, '')