class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document with loads(); get_result() then returns the raw markup
    located between the opening tag carrying the requested id and its
    matching closing tag.

    self.result is built incrementally as [tag, start_pos, end_pos], where
    both positions are (line, column) pairs as returned by getpos().
    """

    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        HTMLParser.HTMLParser.__init__(self)

    def loads(self, html):
        """Parse the whole document, keeping it for slicing in get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A child tag may be the very first thing after the opening tag.
            self.find_startpos(None)
        # Only the first element with the requested id is captured; a later
        # duplicate must not reset a result that is in progress or complete.
        if self.result is None and 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Count open tags per name so that nested tags with the same name
            # as the target do not end the capture early.
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        # An element closed right after being opened produces no content
        # event; record the start position here so the result is '' and
        # not a missing result.
        self.find_startpos(None)
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every kind of content that can follow the opening tag marks the start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the captured markup, stripped, or None if no complete
        element with the requested id was seen."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        # getpos() line numbers are 1-based, columns are 0-based.
        lines = self.html.split('\n')[start_line - 1:end_line]
        if len(lines) == 1:
            return lines[0][start_col:end_col].strip()
        lines[0] = lines[0][start_col:]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()


def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: whatever was captured before the malformed markup
        # is still returned by get_result().
        pass
    return parser.get_result()


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines in the markup are not significant; only <br> tags are
    # turned into line breaks.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = _unescapeHTML(html)
    return html


def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = _unescapeHTML(utitle)
    return utitle.replace(unicode(os.sep), u'%')
unsupporting IE + try: + srtfn = filename.rsplit('.', 1)[0] + u'.srt' + self.report_writesubtitles(srtfn) + srtfile = open(_encodeFilename(srtfn), 'wb') + try: + srtfile.write(info_dict['subtitles'].encode('utf-8')) + finally: + srtfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' @@ -1206,6 +1301,10 @@ class YoutubeIE(InfoExtractor): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + def report_video_subtitles_download(self, video_id): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) @@ -1218,6 +1317,23 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _closed_captions_xml_to_srt(self, xml_string): + srt = '' + texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) + # TODO parse xml instead of regex + for n, (start, dur_tag, dur, caption) in enumerate(texts): + if not dur: dur = '4' + start = float(start) + end = start + float(dur) + start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) + end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) + caption = _unescapeHTML(caption) + caption = _unescapeHTML(caption) # double cycle, inentional + srt += str(n) + '\n' + srt += start + ' --> ' + end + '\n' + srt += caption + '\n\n' + return srt + def _print_formats(self, formats): print 'Available formats:' for x in formats: @@ -1377,18 +1493,40 @@ class YoutubeIE(InfoExtractor): pass # description - try: - 
lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') - else: - html_parser = lxml.etree.HTMLParser(encoding='utf-8') - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) - # TODO use another parser + video_description = get_element_by_id("eow-description", video_webpage) + if video_description: video_description = clean_html(video_description.decode('utf8')) + else: video_description = '' + + # closed captions + video_subtitles = None + if self._downloader.params.get('writesubtitles', False): + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) + if srt_lang_list: + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' + else: + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + self._downloader.trouble(u'WARNING: no closed captions found in the specified language') + else: + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + else: + self._downloader.trouble(u'WARNING: video 
has no closed captions') # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1461,6 +1599,7 @@ class YoutubeIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, + 'subtitles': video_subtitles }) except UnavailableVideoError, err: self._downloader.trouble(u'\nERROR: unable to download video') @@ -2007,7 +2146,7 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: Unable to extract media URL') return video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') - video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) + video_url = _unescapeHTML(video_url) try: # Process video information @@ -2058,7 +2197,7 @@ class VimeoIE(InfoExtractor): video_id = mobj.group(1) # Retrieve video webpage to extract further information - request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) + request = urllib2.Request(url, None, std_headers) try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -2071,77 +2210,66 @@ class VimeoIE(InfoExtractor): # and latter we extract those that are Vimeo specific. 
self.report_extraction(video_id) - # Extract title - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + # Extract the config JSON + config = webpage.split(' = {config:')[1].split(',assets:')[0] + try: + config = json.loads(config) + except: + self._downloader.trouble(u'ERROR: unable to extract info section') return - video_title = mobj.group(1).decode('utf-8') + + # Extract title + video_title = config["video"]["title"] simple_title = _simplify_title(video_title) # Extract uploader - mobj = re.search(r'http://vimeo.com/(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') - return - video_uploader = mobj.group(1).decode('utf-8') + video_uploader = config["video"]["owner"]["name"] # Extract video thumbnail - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1).decode('utf-8') + video_thumbnail = config["video"]["thumbnail"] - # # Extract video description - # mobj = re.search(r'', webpage) - # if mobj is None: - # self._downloader.trouble(u'ERROR: unable to extract video description') - # return - # video_description = mobj.group(1).decode('utf-8') - # if not video_description: video_description = 'No description available.' - video_description = 'Foo.' 
- - # Vimeo specific: extract request signature - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature') - return - sig = mobj.group(1).decode('utf-8') - - # Vimeo specific: extract video quality information - mobj = re.search(r'(\d+)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video quality information') - return - quality = mobj.group(1).decode('utf-8') + # Extract video description + video_description = get_element_by_id("description", webpage) + if video_description: video_description = clean_html(video_description.decode('utf8')) + else: video_description = '' - if int(quality) == 1: - quality = 'hd' + # Extract upload date + video_upload_date = u'NA' + mobj = re.search(r'', webpage) + if mobj is not None: + video_upload_date = mobj.group(1) + + # Vimeo specific: extract request signature and timestamp + sig = config['request']['signature'] + timestamp = config['request']['timestamp'] + + # Vimeo specific: extract video codec and quality information + # TODO bind to format param + codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] + for codec in codecs: + if codec[0] in config["video"]["files"]: + video_codec = codec[0] + video_extension = codec[1] + if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' + else: quality = 'sd' + break else: - quality = 'sd' - - # Vimeo specific: Extract request signature expiration - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature expiration') + self._downloader.trouble(u'ERROR: no known codec found') return - sig_exp = mobj.group(1).decode('utf-8') - video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality) + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, 
quality, video_codec.upper()) try: # Process video information self._downloader.process_info({ - 'id': video_id.decode('utf-8'), + 'id': video_id, 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'stitle': simple_title, - 'ext': u'mp4', - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description, + 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': None, @@ -2250,9 +2378,7 @@ class GenericIE(InfoExtractor): class YoutubeSearchIE(InfoExtractor): """Information Extractor for YouTube search queries.""" _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' - _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR = r'href="/watch\?v=.+?"' - _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' + _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _youtube_ie = None _max_youtube_results = 1000 IE_NAME = u'youtube:search' @@ -2303,45 +2429,39 @@ class YoutubeSearchIE(InfoExtractor): """Downloads a specified number of results for a query""" video_ids = [] - already_seen = set() - pagenum = 1 + pagenum = 0 + limit = n - while True: - self.report_download_page(query, pagenum) - result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) + while (50 * pagenum) < limit: + self.report_download_page(query, pagenum+1) + result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1) request = urllib2.Request(result_url) try: - page = urllib2.urlopen(request).read() + data = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err)) return + api_response = json.loads(data)['data'] - # Extract video identifiers - for mobj in 
re.finditer(self._VIDEO_INDICATOR, page): - video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1] - if video_id not in already_seen: - video_ids.append(video_id) - already_seen.add(video_id) - if len(video_ids) == n: - # Specified n videos reached - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + new_ids = list(video['id'] for video in api_response['items']) + video_ids += new_ids - if re.search(self._MORE_PAGES_INDICATOR, page) is None: - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + limit = min(n, api_response['totalItems']) + pagenum += 1 - pagenum = pagenum + 1 + if len(video_ids) > n: + video_ids = video_ids[:n] + for id in video_ids: + self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) + return class GoogleSearchIE(InfoExtractor): """Information Extractor for Google Video search queries.""" _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' - _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' - _MORE_PAGES_INDICATOR = r'Next' + _VIDEO_INDICATOR = r'\s*Next\s*' _youtube_ie = None IE_NAME = u'youtube:playlist' @@ -2571,7 +2689,7 @@ class YoutubePlaylistIE(InfoExtractor): # Extract video identifiers ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR, page): + for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): if mobj.group(1) not in ids_in_page: ids_in_page.append(mobj.group(1)) video_ids.extend(ids_in_page) @@ -2582,7 +2700,10 @@ class YoutubePlaylistIE(InfoExtractor): playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) - video_ids = video_ids[playliststart:playlistend] + if playlistend == -1: + video_ids = video_ids[playliststart:] + else: + video_ids = video_ids[playliststart:playlistend] for id in video_ids: 
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) @@ -3277,8 +3398,6 @@ class EscapistIE(InfoExtractor): self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) def _real_extract(self, url): - htmlParser = HTMLParser.HTMLParser() - mobj = re.match(self._VALID_URL, url) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3294,11 +3413,11 @@ class EscapistIE(InfoExtractor): return descMatch = re.search('([^<]+)', coursepage) if m: - info['title'] = unescapeHTML(m.group(1)) + info['title'] = _unescapeHTML(m.group(1)) else: info['title'] = info['id'] info['stitle'] = _simplify_title(info['title']) m = re.search('([^<]+)', coursepage) if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = _unescapeHTML(m.group(1)) links = _orderedSet(re.findall('', coursepage)) info['list'] = [ { 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage), } for vpage in links] @@ -3881,8 +3990,6 @@ class StanfordOpenClassroomIE(InfoExtractor): assert entry['type'] == 'reference' self.extract(entry['url']) else: # Root page - unescapeHTML = HTMLParser.HTMLParser().unescape - info = { 'id': 'Stanford OpenClassroom', 'type': 'playlist', @@ -3903,7 +4010,7 @@ class StanfordOpenClassroomIE(InfoExtractor): info['list'] = [ { 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage), } for cpage in links] @@ -4328,6 +4435,12 @@ def parseOpts(): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') + video_format.add_option('--write-srt', + action='store_true', 
dest='writesubtitles', + help='write video closed captions to a .srt file (currently youtube only)', default=False) + video_format.add_option('--srt-lang', + action='store', dest='subtitleslang', metavar='LANG', + help='language of the closed captions to download (optional) use IETF language tags like \'en\'') verbosity.add_option('-q', '--quiet', @@ -4592,6 +4705,8 @@ def _real_main(): 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, 'writeinfojson': opts.writeinfojson, + 'writesubtitles': opts.writesubtitles, + 'subtitleslang': opts.subtitleslang, 'matchtitle': opts.matchtitle, 'rejecttitle': opts.rejecttitle, 'max_downloads': opts.max_downloads,