X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Ffirstpost.py;h=298227d5793770c82d8868256d655fa7ea3dc31c;hb=1fc0b47fdf9367aa71e6b81076666f137e68f637;hp=7e3d1afd215bfc8ef39c6751e116def05bf858ed;hpb=6cb38a999488b9a172c820b5db2cd2a5effa07c0;p=youtube-dl diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 7e3d1afd2..298227d57 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -1,12 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class FirstpostIE(InfoExtractor): - IE_NAME = 'Firstpost.com' _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' _TEST = { @@ -16,23 +13,38 @@ class FirstpostIE(InfoExtractor): 'id': '1025403', 'ext': 'mp4', 'title': 'India to launch indigenous aircraft carrier INS Vikrant today', - 'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.', + 'description': 'md5:feef3041cb09724e0bdc02843348f5f4', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + page = self._download_webpage(url, video_id) + + title = self._html_search_meta('twitter:title', page, 'title', fatal=True) + description = self._html_search_meta('twitter:description', page, 'title') + + data = self._download_xml( + 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id, + 'Downloading video XML') + + item = data.find('./playlist/item') + thumbnail = item.find('./image').text - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, 'video URL') + formats = [ + { + 'url': details.find('./file').text, + 'format_id': details.find('./label').text.strip(), + 'width': int(details.find('./width').text.strip()), + 'height': 
int(details.find('./height').text.strip()), + } for details in item.findall('./source/file_details') if details.find('./file').text + ] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, }