ignore parsing errors in get_element_by_id()

[youtube-dl] / youtube-dl
diff --git a/youtube-dl b/youtube-dl

index 5c81973cd4ea33c1c470fa44d4940dce2f7ef90c..dc7ec136fcbd27957633ccf037bad6e4e136cb43 100755 (executable)
--- a/youtube-dl
+++ b/youtube-dl
@@ -15,6 +15,7 @@ __authors__  = (
         'Kevin Ngo',
         'Ori Avtalion',
         'shizeeg',
+       'Filippo Valsorda',
         )
  
  __license__ = 'Public Domain'
@@ -66,11 +67,6 @@ try:
  except ImportError:
         from cgi import parse_qs
  
-try:
-       import lxml.etree
-except ImportError:
-       pass # Handled below
-
  try:
         import xml.etree.ElementTree
  except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -197,6 +193,72 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                         return res
  
+
+class IDParser(HTMLParser.HTMLParser):
+       """Modified HTMLParser that isolates a tag with the specified id.
+
+       Feed a document with loads(), then call get_result() to obtain the
+       text located between the opening tag carrying the id and its matching
+       closing tag.
+       """
+       def __init__(self, id):
+               self.id = id
+               # Becomes [tag, start_pos, end_pos] once the element is delimited;
+               # both positions are the (line, column) pairs given by getpos().
+               self.result = None
+               # True while the parser is inside the requested element.
+               self.started = False
+               # Count of currently open tags, per tag name, inside the element.
+               self.depth = {}
+               self.html = None
+               # True from the matching opening tag until the next token is seen.
+               self.watch_startpos = False
+               HTMLParser.HTMLParser.__init__(self)
+
+       def loads(self, html):
+               """Parse the whole document, keeping it for get_result()."""
+               self.html = html
+               self.feed(html)
+               self.close()
+
+       def handle_starttag(self, tag, attrs):
+               attrs = dict(attrs)
+               if self.started:
+                       # A nested tag is the first token of the content.
+                       self.find_startpos(None)
+               if 'id' in attrs and attrs['id'] == self.id:
+                       self.result = [tag]
+                       self.started = True
+                       self.watch_startpos = True
+               if self.started:
+                       if tag not in self.depth: self.depth[tag] = 0
+                       self.depth[tag] += 1
+
+       def handle_endtag(self, tag):
+               if self.started:
+                       # An end tag may be the very first token after the opening
+                       # tag (empty element). Record the start position here too,
+                       # otherwise watch_startpos stays armed and a token found
+                       # after the element would be stored as its start.
+                       self.find_startpos(None)
+                       if tag in self.depth: self.depth[tag] -= 1
+                       if self.depth[self.result[0]] == 0:
+                               self.started = False
+                               self.result.append(self.getpos())
+
+       def find_startpos(self, x):
+               """Needed to put the start position of the result (self.result[1])
+               after the opening tag with the requested id"""
+               if self.watch_startpos:
+                       self.watch_startpos = False
+                       self.result.append(self.getpos())
+       # Every other token kind can also be the first piece of content.
+       handle_entityref = handle_charref = handle_data = handle_comment = \
+       handle_decl = handle_pi = unknown_decl = find_startpos
+
+       def get_result(self):
+               """Return the stripped content of the element, or None when the
+               element was not found or its closing tag was never reached."""
+               if self.result is None: return None
+               if len(self.result) != 3: return None
+               start_line, start_col = self.result[1]
+               end_line, end_col = self.result[2]
+               # getpos() lines are 1-based, columns are 0-based.
+               lines = self.html.split('\n')[start_line-1:end_line]
+               if len(lines) == 1:
+                       # Start and end share a line: cut both ends at once.
+                       return lines[0][start_col:end_col].strip()
+               lines[0] = lines[0][start_col:]
+               lines[-1] = lines[-1][:end_col]
+               return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+       """Return the content of the tag with the specified id in the passed HTML document"""
+       finder = IDParser(id)
+       try:
+               finder.loads(html)
+       except HTMLParser.HTMLParseError:
+               # Best effort: a parse error is ignored and whatever the parser
+               # had collected up to that point is used.
+               pass
+       content = finder.get_result()
+       return content
+
+
  def preferredencoding():
         """Get preferred encoding.
  
@@ -241,9 +303,21 @@ def htmlentity_transform(matchobj):
         return (u'&%s;' % entity)
  
  
+def clean_html(html):
+       """Clean an HTML snippet into a readable string"""
+       # Source newlines carry no meaning: flatten them to spaces first.
+       flattened = html.replace('\n', ' ')
+       # Each <br> (with any surrounding whitespace) becomes a real newline.
+       with_breaks = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', flattened)
+       # Drop every remaining tag.
+       text_only = re.sub('<.*?>', '', with_breaks)
+       # Finally decode the html entities.
+       return _unescapeHTML(text_only)
+
+
  def sanitize_title(utitle):
         """Sanitizes a video title so it could be used as part of a filename."""
-       utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+       utitle = _unescapeHTML(utitle)
         return utitle.replace(unicode(os.sep), u'%')
  
  
@@ -300,8 +374,8 @@ def _unescapeHTML(s):
         """
         assert type(s) == type(u'')
  
-       htmlParser = HTMLParser.HTMLParser()
-       return htmlParser.unescape(s)
+       result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+       return result
  
  def _encodeFilename(s):
         """
@@ -491,6 +565,7 @@ class FileDownloader(object):
         writedescription: Write the video description to a .description file
         writeinfojson:    Write the video description to a .info.json file
         writesubtitles:   Write the video subtitles to a .srt file
+       subtitleslang:    Language of the subtitles to download
         """
  
         params = None
@@ -1252,8 +1327,8 @@ class YoutubeIE(InfoExtractor):
                         end = start + float(dur)
                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-                       caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
-                       caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
+                       caption = _unescapeHTML(caption)
+                       caption = _unescapeHTML(caption) # double cycle, intentional
                         srt += str(n) + '\n'
                         srt += start + ' --> ' + end + '\n'
                         srt += caption + '\n\n'
@@ -1418,18 +1493,9 @@ class YoutubeIE(InfoExtractor):
                                         pass
  
                 # description
-               try:
-                       lxml.etree
-               except NameError:
-                       video_description = u'No description available.'
-                       mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
-                       if mobj is not None:
-                               video_description = mobj.group(1).decode('utf-8')
-               else:
-                       html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-                       vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-                       video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-                       # TODO use another parser
+               video_description = get_element_by_id("eow-description", video_webpage)
+               if video_description: video_description = clean_html(video_description.decode('utf8'))
+               else: video_description = ''
                         
                 # closed captions
                 video_subtitles = None
@@ -1443,17 +1509,24 @@ class YoutubeIE(InfoExtractor):
                         else:
                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                                 if srt_lang_list:
-                                       if 'en' in srt_lang_list: srt_lang = 'en'
-                                       else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
-                                       request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
-                                       try:
-                                               srt_xml = urllib2.urlopen(request).read()
-                                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-                                               self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                                       if self._downloader.params.get('subtitleslang', False):
+                                               srt_lang = self._downloader.params.get('subtitleslang')
+                                       elif 'en' in srt_lang_list:
+                                               srt_lang = 'en'
+                                       else:
+                                               srt_lang = srt_lang_list[0]
+                                       if not srt_lang in srt_lang_list:
+                                               self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
                                         else:
-                                               video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+                                               request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+                                               try:
+                                                       srt_xml = urllib2.urlopen(request).read()
+                                               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                                       self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                                               else:
+                                                       video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                                 else:
-                                       self._downloader.trouble(u'WARNING: video has no subtitles')
+                                       self._downloader.trouble(u'WARNING: video has no closed captions')
  
                 # token
                 video_token = urllib.unquote_plus(video_info['token'][0])
@@ -2073,7 +2146,7 @@ class YahooIE(InfoExtractor):
                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
                         return
                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
-               video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+               video_url = _unescapeHTML(video_url)
  
                 try:
                         # Process video information
@@ -2156,18 +2229,9 @@ class VimeoIE(InfoExtractor):
                 video_thumbnail = config["video"]["thumbnail"]
  
                 # Extract video description
-               try:
-                       lxml.etree
-               except NameError:
-                       video_description = u'No description available.'
-                       mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
-                       if mobj is not None:
-                               video_description = mobj.group(1)
-               else:
-                       html_parser = lxml.etree.HTMLParser()
-                       vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-                       video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-                       # TODO use another parser
+               video_description = get_element_by_id("description", webpage)
+               if video_description: video_description = clean_html(video_description.decode('utf8'))
+               else: video_description = ''
  
                 # Extract upload date
                 video_upload_date = u'NA'
@@ -2314,9 +2378,7 @@ class GenericIE(InfoExtractor):
  class YoutubeSearchIE(InfoExtractor):
         """Information Extractor for YouTube search queries."""
         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
-       _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
-       _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
-       _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
+       _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
         _youtube_ie = None
         _max_youtube_results = 1000
         IE_NAME = u'youtube:search'
@@ -2367,37 +2429,31 @@ class YoutubeSearchIE(InfoExtractor):
                 """Downloads a specified number of results for a query"""
  
                 video_ids = []
-               already_seen = set()
-               pagenum = 1
+               pagenum = 0
+               limit = n
  
-               while True:
-                       self.report_download_page(query, pagenum)
-                       result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+               while (50 * pagenum) < limit:
+                       self.report_download_page(query, pagenum+1)
+                       result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
                         request = urllib2.Request(result_url)
                         try:
-                               page = urllib2.urlopen(request).read()
+                               data = urllib2.urlopen(request).read()
                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-                               self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+                               self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                                 return
+                       api_response = json.loads(data)['data']
  
-                       # Extract video identifiers
-                       for mobj in re.finditer(self._VIDEO_INDICATOR, page):
-                               video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
-                               if video_id not in already_seen:
-                                       video_ids.append(video_id)
-                                       already_seen.add(video_id)
-                                       if len(video_ids) == n:
-                                               # Specified n videos reached
-                                               for id in video_ids:
-                                                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
-                                               return
+                       new_ids = list(video['id'] for video in api_response['items'])
+                       video_ids += new_ids
  
-                       if re.search(self._MORE_PAGES_INDICATOR, page) is None:
-                               for id in video_ids:
-                                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
-                               return
+                       limit = min(n, api_response['totalItems'])
+                       pagenum += 1
  
-                       pagenum = pagenum + 1
+               if len(video_ids) > n:
+                       video_ids = video_ids[:n]
+               for id in video_ids:
+                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+               return
  
  
  class GoogleSearchIE(InfoExtractor):
@@ -2581,7 +2637,7 @@ class YoutubePlaylistIE(InfoExtractor):
  
         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
-       _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
+       _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
         _youtube_ie = None
         IE_NAME = u'youtube:playlist'
@@ -2633,7 +2689,7 @@ class YoutubePlaylistIE(InfoExtractor):
  
                         # Extract video identifiers
                         ids_in_page = []
-                       for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+                       for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                                 if mobj.group(1) not in ids_in_page:
                                         ids_in_page.append(mobj.group(1))
                         video_ids.extend(ids_in_page)
@@ -2644,7 +2700,10 @@ class YoutubePlaylistIE(InfoExtractor):
  
                 playliststart = self._downloader.params.get('playliststart', 1) - 1
                 playlistend = self._downloader.params.get('playlistend', -1)
-               video_ids = video_ids[playliststart:playlistend]
+               if playlistend == -1:
+                       video_ids = video_ids[playliststart:]
+               else:
+                       video_ids = video_ids[playliststart:playlistend]
  
                 for id in video_ids:
                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
@@ -3339,8 +3398,6 @@ class EscapistIE(InfoExtractor):
                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
  
         def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3356,11 +3413,11 @@ class EscapistIE(InfoExtractor):
                         return
  
                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-               description = htmlParser.unescape(descMatch.group(1))
+               description = _unescapeHTML(descMatch.group(1))
                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-               imgUrl = htmlParser.unescape(imgMatch.group(1))
+               imgUrl = _unescapeHTML(imgMatch.group(1))
                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-               playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
+               playerUrl = _unescapeHTML(playerUrlMatch.group(1))
                 configUrlMatch = re.search('config=(.*)$', playerUrl)
                 configUrl = urllib2.unquote(configUrlMatch.group(1))
  
@@ -3419,8 +3476,6 @@ class CollegeHumorIE(InfoExtractor):
                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
  
         def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3491,8 +3546,6 @@ class XVideosIE(InfoExtractor):
                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
  
         def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3581,8 +3634,6 @@ class SoundcloudIE(InfoExtractor):
                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
  
         def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3670,8 +3721,6 @@ class InfoQIE(InfoExtractor):
                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
  
         def _real_extract(self, url):
-               htmlParser = HTMLParser.HTMLParser()
-
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3905,8 +3954,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
                         except UnavailableVideoError, err:
                                 self._downloader.trouble(u'\nERROR: unable to download video')
                 elif mobj.group('course'): # A course page
-                       unescapeHTML = HTMLParser.HTMLParser().unescape
-
                         course = mobj.group('course')
                         info = {
                                 'id': _simplify_title(course),
@@ -3922,20 +3969,20 @@ class StanfordOpenClassroomIE(InfoExtractor):
  
                         m = re.search('<h1>([^<]+)</h1>', coursepage)
                         if m:
-                               info['title'] = unescapeHTML(m.group(1))
+                               info['title'] = _unescapeHTML(m.group(1))
                         else:
                                 info['title'] = info['id']
                         info['stitle'] = _simplify_title(info['title'])
  
                         m = re.search('<description>([^<]+)</description>', coursepage)
                         if m:
-                               info['description'] = unescapeHTML(m.group(1))
+                               info['description'] = _unescapeHTML(m.group(1))
  
                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                         info['list'] = [
                                 {
                                         'type': 'reference',
-                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
+                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
                                 }
                                         for vpage in links]
  
@@ -3943,8 +3990,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                 assert entry['type'] == 'reference'
                                 self.extract(entry['url'])
                 else: # Root page
-                       unescapeHTML = HTMLParser.HTMLParser().unescape
-
                         info = {
                                 'id': 'Stanford OpenClassroom',
                                 'type': 'playlist',
@@ -3965,7 +4010,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
                         info['list'] = [
                                 {
                                         'type': 'reference',
-                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
+                                       'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
                                 }
                                         for cpage in links]
  
@@ -4390,6 +4435,12 @@ def parseOpts():
                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
         video_format.add_option('-F', '--list-formats',
                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
+       video_format.add_option('--write-srt',
+                       action='store_true', dest='writesubtitles',
+                       help='write video closed captions to a .srt file (currently youtube only)', default=False)
+       video_format.add_option('--srt-lang',
+                       action='store', dest='subtitleslang', metavar='LANG',
+                       help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
  
  
         verbosity.add_option('-q', '--quiet',
@@ -4454,9 +4505,6 @@ def parseOpts():
         filesystem.add_option('--write-info-json',
                         action='store_true', dest='writeinfojson',
                         help='write video metadata to a .info.json file', default=False)
-       filesystem.add_option('--write-srt',
-                       action='store_true', dest='writesubtitles',
-                       help='write video subtitles to a .srt file', default=False)
  
  
         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
@@ -4658,6 +4706,7 @@ def _real_main():
                 'writedescription': opts.writedescription,
                 'writeinfojson': opts.writeinfojson,
                 'writesubtitles': opts.writesubtitles,
+               'subtitleslang': opts.subtitleslang,
                 'matchtitle': opts.matchtitle,
                 'rejecttitle': opts.rejecttitle,
                 'max_downloads': opts.max_downloads,