end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
caption = unescapeHTML(caption)
caption = unescapeHTML(caption) # double cycle, intentional
- srt += str(n) + '\n'
+ srt += str(n+1) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
return srt
self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
return
+ # Check for "rental" videos
+ if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
+ self._downloader.trouble(u'ERROR: "rental" videos not supported')
+ return
+
# Start extracting information
self.report_information_extraction(video_id)
srt_list = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
- srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
+ srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
+ srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
if not srt_lang_list:
raise Trouble(u'WARNING: video has no closed captions')
if self._downloader.params.get('subtitleslang', False):
elif 'en' in srt_lang_list:
srt_lang = 'en'
else:
- srt_lang = srt_lang_list[0]
+ srt_lang = srt_lang_list.keys()[0]
if not srt_lang in srt_lang_list:
raise Trouble(u'WARNING: no closed captions found in the specified language')
- request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+ request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
try:
srt_xml = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+ if not srt_xml:
+ raise Trouble(u'WARNING: unable to download video subtitles')
video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
except Trouble as trouble:
self._downloader.trouble(trouble[0])
InfoExtractor.__init__(self, downloader)
def report_download_page(self, query, pagenum):
- """Report attempt to download playlist page with given number."""
+ """Report attempt to download search page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
- _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
- _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
+ _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
+ _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
IE_NAME = u'youtube:playlist'
def __init__(self, downloader=None):
self.report_extraction(showName)
try:
- webPageBytes = urllib2.urlopen(url).read()
+ webPage = urllib2.urlopen(url)
+ webPageBytes = webPage.read()
+ m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
+ webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
return
- webPage = webPageBytes.decode('utf-8')
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
# Extract video thumbnail
- mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
+ mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
return
- video_thumbnail = mobj.group(1).decode('utf-8')
+ video_thumbnail = mobj.group(0).decode('utf-8')
info = {
'id': video_id,
}
return [info]
+
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    # Group 1 of _VALID_URL is the numeric video id.
    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Each of the patterns below captures the wanted value in group 1.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _search_first_group(self, pattern, webpage, error_message):
        """Return group 1 of the first match of pattern in webpage.

        When nothing matches, error_message is reported through the
        downloader and None is returned, so the caller never touches a
        missing match object.
        """
        result = re.search(pattern, webpage)
        if result is None:
            self._downloader.trouble(error_message)
            # Only reached if trouble() returned instead of raising.
            return None
        return result.group(1)

    def extract_video_url(self, webpage):
        """Extract the url for the video from the webpage

        Returns the unquoted url, or None if it could not be found
        (the error is reported through the downloader).
        """
        raw_url = self._search_first_group(
            self.VIDEO_URL_RE, webpage, u'ERROR: unable to extract video url')
        if raw_url is None:
            return None
        return urllib.unquote(raw_url.decode('utf-8'))

    def extract_video_title(self, webpage):
        """Extract the title for the video from the webpage

        Returns the title, or None if it could not be found
        (the error is reported through the downloader).
        """
        raw_title = self._search_first_group(
            self.VIDEO_TITLE_RE, webpage, u'ERROR: unable to extract video title')
        if raw_title is None:
            return None
        return raw_title.decode('utf-8')

    def extract_video_thumbnail(self, webpage):
        """Extract the thumbnail for the video from the webpage

        Returns the thumbnail url, or None if it could not be found
        (the error is reported through the downloader).
        """
        raw_thumbnail = self._search_first_group(
            self.VIDEO_THUMB_RE, webpage, u'ERROR: unable to extract video thumbnail')
        if raw_thumbnail is None:
            return None
        return raw_thumbnail.decode('utf-8')

    def _real_extract(self, url):
        """Download the page at url and build the video information.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error (invalid url, failed download, or a
        field that could not be extracted from the page).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Each extractor has already reported its own error when it
        # returns None, so simply stop at the first missing field.
        video_url = self.extract_video_url(webpage)
        if video_url is None:
            return
        video_title = self.extract_video_title(webpage)
        if video_title is None:
            return
        video_thumbnail = self.extract_video_thumbnail(webpage)
        if video_thumbnail is None:
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }

        return [info]
\ No newline at end of file