X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=5d54e93e7808c630fe1519d7c9a06a32da88cbfd;hb=8409501206e37d57f01e5fe72bfc54a5562e4e0a;hp=0f1880756f050d7c0d6d790bde39754cd810f183;hpb=476203d025dd2619ea9f9e2f99ffce507dec6596;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0f1880756..5d54e93e7 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) @@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor): title = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) + # It isn't there in the HTML it returns to us + # uploader_date = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) description = self._search_regex(r'', webpage, 'description', fatal=False) @@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': uploader_date, + # 'uploader_date': uploader_date, 'description': description, } return [info] @@ -3348,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor): title = clean_html(title) video_description = self._search_regex(r'(?P.*)</h1>', - webpage, u'title').strip() - - # Get the video date - upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', - webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date.strip()) + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video uploader - video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', - webpage, u'uploader', fatal=False) - if video_uploader: video_uploader = clean_html(video_uploader.strip()) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' @@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor): 'title': title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if self._downloader.params.get('listformats', None): @@ -4300,7 +4301,7 @@ class TeamcocoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' @@ -4309,8 +4310,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4320,32 +4322,26 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'Description: (?P[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + video_description = self._search_regex(r'Description: (?P[^<]+)', + webpage, u'description', fatal=False) + if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r']+>(?P[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4376,10 +4372,9 @@ class HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0]