ignore parsing errors in get_element_by_id()
[youtube-dl] / youtube_dl / __init__.py
index 3b9e2d70e6da720b53f4c37b6815727c37721338..dc7ec136fcbd27957633ccf037bad6e4e136cb43 100755 (executable)
@@ -15,6 +15,7 @@ __authors__  = (
        'Kevin Ngo',
        'Ori Avtalion',
        'shizeeg',
+       'Filippo Valsorda',
        )
 
 __license__ = 'Public Domain'
@@ -66,11 +67,6 @@ try:
 except ImportError:
        from cgi import parse_qs
 
-try:
-       import lxml.etree
-except ImportError:
-       pass # Handled below
-
 try:
        import xml.etree.ElementTree
 except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -197,6 +193,72 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
 
+
+class IDParser(HTMLParser.HTMLParser):
+       """Modified HTMLParser that isolates a tag with the specified id"""
+       def __init__(self, id):
+               self.id = id
+               self.result = None
+               self.started = False
+               self.depth = {}
+               self.html = None
+               self.watch_startpos = False
+               HTMLParser.HTMLParser.__init__(self)
+
+       def loads(self, html):
+               self.html = html
+               self.feed(html)
+               self.close()
+
+       def handle_starttag(self, tag, attrs):
+               attrs = dict(attrs)
+               if self.started:
+                       self.find_startpos(None)
+               if 'id' in attrs and attrs['id'] == self.id:
+                       self.result = [tag]
+                       self.started = True
+                       self.watch_startpos = True
+               if self.started:
+                       if not tag in self.depth: self.depth[tag] = 0
+                       self.depth[tag] += 1
+
+       def handle_endtag(self, tag):
+               if self.started:
+                       if tag in self.depth: self.depth[tag] -= 1
+                       if self.depth[self.result[0]] == 0:
+                               self.started = False
+                               self.result.append(self.getpos())
+
+       def find_startpos(self, x):
+               """Needed to put the start position of the result (self.result[1])
+               after the opening tag with the requested id"""
+               if self.watch_startpos:
+                       self.watch_startpos = False
+                       self.result.append(self.getpos())
+       handle_entityref = handle_charref = handle_data = handle_comment = \
+       handle_decl = handle_pi = unknown_decl = find_startpos
+
+       def get_result(self):
+               if self.result == None: return None
+               if len(self.result) != 3: return None
+               lines = self.html.split('\n')
+               lines = lines[self.result[1][0]-1:self.result[2][0]]
+               lines[0] = lines[0][self.result[1][1]:]
+               if len(lines) == 1:
+                       lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
+               lines[-1] = lines[-1][:self.result[2][1]]
+               return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+       """Return the content of the tag with the specified id in the passed HTML document"""
+       parser = IDParser(id)
+       try:
+               parser.loads(html)
+       except HTMLParser.HTMLParseError:
+               pass
+       return parser.get_result()
+
+
 def preferredencoding():
        """Get preferred encoding.
 
@@ -241,9 +303,21 @@ def htmlentity_transform(matchobj):
        return (u'&%s;' % entity)
 
 
+def clean_html(html):
+       """Clean an HTML snippet into a readable string"""
+       # Newline vs <br />
+       html = html.replace('\n', ' ')
+       html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+       # Strip html tags
+       html = re.sub('<.*?>', '', html)
+       # Replace html entities
+       html = _unescapeHTML(html)
+       return html
+
+
 def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
-       utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+       utitle = _unescapeHTML(utitle)
        return utitle.replace(unicode(os.sep), u'%')
 
 
@@ -300,8 +374,8 @@ def _unescapeHTML(s):
        """
        assert type(s) == type(u'')
 
-       htmlParser = HTMLParser.HTMLParser()
-       return htmlParser.unescape(s)
+       result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+       return result
 
 def _encodeFilename(s):
        """
@@ -490,6 +564,8 @@ class FileDownloader(object):
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
+       writesubtitles:   Write the video subtitles to a .srt file
+       subtitleslang:    Language of the subtitles to download
        """
 
        params = None
@@ -681,6 +757,10 @@ class FileDownloader(object):
                """ Report that the description file is being written """
                self.to_screen(u'[info] Writing video description to: ' + descfn)
 
+       def report_writesubtitles(self, srtfn):
+               """ Report that the subtitles file is being written """
+               self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
+
        def report_writeinfojson(self, infofn):
                """ Report that the metadata file has been written """
                self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
@@ -808,6 +888,21 @@ class FileDownloader(object):
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return
+                               
+               if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
+                       # subtitles download errors are already managed as troubles in relevant IE
+                       # that way it will silently go on when used with unsupporting IE 
+                       try:
+                               srtfn = filename.rsplit('.', 1)[0] + u'.srt'
+                               self.report_writesubtitles(srtfn)
+                               srtfile = open(_encodeFilename(srtfn), 'wb')
+                               try:
+                                       srtfile.write(info_dict['subtitles'].encode('utf-8'))
+                               finally:
+                                       srtfile.close()
+                       except (OSError, IOError):
+                               self.trouble(u'ERROR: Cannot write subtitles file ' + srtfn)
+                               return
 
                if self.params.get('writeinfojson', False):
                        infofn = filename + u'.info.json'
@@ -1206,6 +1301,10 @@ class YoutubeIE(InfoExtractor):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 
+       def report_video_subtitles_download(self, video_id):
+               """Report attempt to download video subtitles."""
+               self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+
        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -1218,6 +1317,23 @@ class YoutubeIE(InfoExtractor):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')
 
+       def _closed_captions_xml_to_srt(self, xml_string):
+               srt = ''
+               texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
+               # TODO parse xml instead of regex
+               for n, (start, dur_tag, dur, caption) in enumerate(texts):
+                       if not dur: dur = '4'
+                       start = float(start)
+                       end = start + float(dur)
+                       start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
+                       end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
+                       caption = _unescapeHTML(caption)
+                       caption = _unescapeHTML(caption) # double cycle, intentional
+                       srt += str(n+1) + '\n'
+                       srt += start + ' --> ' + end + '\n'
+                       srt += caption + '\n\n'
+               return srt
+
        def _print_formats(self, formats):
                print 'Available formats:'
                for x in formats:
@@ -1377,18 +1493,40 @@ class YoutubeIE(InfoExtractor):
                                        pass
 
                # description
-               try:
-                       lxml.etree
-               except NameError:
-                       video_description = u'No description available.'
-                       mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
-                       if mobj is not None:
-                               video_description = mobj.group(1).decode('utf-8')
-               else:
-                       html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-                       vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-                       video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-                       # TODO use another parser
+               video_description = get_element_by_id("eow-description", video_webpage)
+               if video_description: video_description = clean_html(video_description.decode('utf8'))
+               else: video_description = ''
+                       
+               # closed captions
+               video_subtitles = None
+               if self._downloader.params.get('writesubtitles', False):
+                       self.report_video_subtitles_download(video_id)
+                       request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+                       try:
+                               srt_list = urllib2.urlopen(request).read()
+                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                               self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                       else:
+                               srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
+                               if srt_lang_list:
+                                       if self._downloader.params.get('subtitleslang', False):
+                                               srt_lang = self._downloader.params.get('subtitleslang')
+                                       elif 'en' in srt_lang_list:
+                                               srt_lang = 'en'
+                                       else:
+                                               srt_lang = srt_lang_list[0]
+                                       if not srt_lang in srt_lang_list:
+                                               self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
+                                       else:
+                                               request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+                                               try:
+                                                       srt_xml = urllib2.urlopen(request).read()
+                                               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                                       self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                                               else:
+                                                       video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+                               else:
+                                       self._downloader.trouble(u'WARNING: video has no closed captions')
 
                # token
                video_token = urllib.unquote_plus(video_info['token'][0])
@@ -1461,6 +1599,7 @@ class YoutubeIE(InfoExtractor):
                                        'thumbnail':    video_thumbnail.decode('utf-8'),
                                        'description':  video_description,
                                        'player_url':   player_url,
+                                       'subtitles':    video_subtitles
                                })
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download video')
@@ -2007,7 +2146,7 @@ class YahooIE(InfoExtractor):
                        self._downloader.trouble(u'ERROR: Unable to extract media URL')
                        return
                video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
-               video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+               video_url = _unescapeHTML(video_url)
 
                try:
                        # Process video information
@@ -2090,18 +2229,9 @@ class VimeoIE(InfoExtractor):
                video_thumbnail = config["video"]["thumbnail"]
 
                # Extract video description
-               try:
-                       lxml.etree
-               except NameError:
-                       video_description = u'No description available.'
-                       mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
-                       if mobj is not None:
-                               video_description = mobj.group(1)
-               else:
-                       html_parser = lxml.etree.HTMLParser()
-                       vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-                       video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-                       # TODO use another parser
+               video_description = get_element_by_id("description", webpage)
+               if video_description: video_description = clean_html(video_description.decode('utf8'))
+               else: video_description = ''
 
                # Extract upload date
                video_upload_date = u'NA'
@@ -3268,8 +3398,6 @@ class EscapistIE(InfoExtractor):
                self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 
        def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3285,11 +3413,11 @@ class EscapistIE(InfoExtractor):
                        return
 
                descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-               description = htmlParser.unescape(descMatch.group(1))
+               description = _unescapeHTML(descMatch.group(1))
                imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-               imgUrl = htmlParser.unescape(imgMatch.group(1))
+               imgUrl = _unescapeHTML(imgMatch.group(1))
                playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-               playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
+               playerUrl = _unescapeHTML(playerUrlMatch.group(1))
                configUrlMatch = re.search('config=(.*)$', playerUrl)
                configUrl = urllib2.unquote(configUrlMatch.group(1))
 
@@ -3348,8 +3476,6 @@ class CollegeHumorIE(InfoExtractor):
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
        def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3420,8 +3546,6 @@ class XVideosIE(InfoExtractor):
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
        def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3510,8 +3634,6 @@ class SoundcloudIE(InfoExtractor):
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
        def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3599,8 +3721,6 @@ class InfoQIE(InfoExtractor):
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
        def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3834,8 +3954,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download video')
                elif mobj.group('course'): # A course page
-                       unescapeHTML = HTMLParser.HTMLParser().unescape
-
                        course = mobj.group('course')
                        info = {
                                'id': _simplify_title(course),
@@ -3851,20 +3969,20 @@ class StanfordOpenClassroomIE(InfoExtractor):
 
                        m = re.search('<h1>([^<]+)</h1>', coursepage)
                        if m:
-                               info['title'] = unescapeHTML(m.group(1))
+                               info['title'] = _unescapeHTML(m.group(1))
                        else:
                                info['title'] = info['id']
                        info['stitle'] = _simplify_title(info['title'])
 
                        m = re.search('<description>([^<]+)</description>', coursepage)
                        if m:
-                               info['description'] = unescapeHTML(m.group(1))
+                               info['description'] = _unescapeHTML(m.group(1))
 
                        links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                        info['list'] = [
                                {
                                        'type': 'reference',
-                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
+                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
                                }
                                        for vpage in links]
 
@@ -3872,8 +3990,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                assert entry['type'] == 'reference'
                                self.extract(entry['url'])
                else: # Root page
-                       unescapeHTML = HTMLParser.HTMLParser().unescape
-
                        info = {
                                'id': 'Stanford OpenClassroom',
                                'type': 'playlist',
@@ -3894,7 +4010,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
                        info['list'] = [
                                {
                                        'type': 'reference',
-                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
+                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
                                }
                                        for cpage in links]
 
@@ -4319,6 +4435,12 @@ def parseOpts():
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
+       video_format.add_option('--write-srt',
+                       action='store_true', dest='writesubtitles',
+                       help='write video closed captions to a .srt file (currently youtube only)', default=False)
+       video_format.add_option('--srt-lang',
+                       action='store', dest='subtitleslang', metavar='LANG',
+                       help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
 
 
        verbosity.add_option('-q', '--quiet',
@@ -4583,6 +4705,8 @@ def _real_main():
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
+               'writesubtitles': opts.writesubtitles,
+               'subtitleslang': opts.subtitleslang,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                'max_downloads': opts.max_downloads,