X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=inline;f=youtube_dl%2Fextractor%2Fpornhub.py;h=6d57e1d35eb567b26f65dea8cc74c653f955c08f;hb=b5aad37f6bdc72acaca198202dc9f7eaa3185e51;hp=ac298d0ce95527d14482abafa20c0d82f490f57d;hpb=3454139576ad98b62162ba0a9bca4b342c5d07ce;p=youtube-dl diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index ac298d0ce..6d57e1d35 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import itertools @@ -39,7 +40,25 @@ class PornHubIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 18, - } + }, + }, { + # non-ASCII title + 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'uploader': 'cj397186295', + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -76,19 +95,25 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. + title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._search_regex( + (r']+class=["\']title["\'][^>]*>(?P[^<]+)', + r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', + r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + flashvars = self._parse_json( self._search_regex( - r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: - video_title = flashvars.get('video_title') thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) else: - video_title, thumbnail, duration = [None] * 3 - - if not video_title: - video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') + title, thumbnail, duration = [None] * 3 video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', @@ -137,7 +162,7 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, 'uploader': video_uploader, - 'title': video_title, + 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count,