[addanime] improve
author: Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 02:25:38 +0000 (04:25 +0200)
committer: Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 02:25:38 +0000 (04:25 +0200)
1  2 
youtube_dl/extractor/__init__.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/common.py
youtube_dl/utils.py

index 2f86f2acaca564f1db652a837dee702507bed8f7,28dcb2cc4ae830d3b1a4debafae02e3a1d66dafa..c76b99a81687e0904bbc147a4e592fd130cfa227
@@@ -1,4 -1,5 +1,5 @@@
 -
 -from .AddAnime import AddAnimeIE
 +from .appletrailers import AppleTrailersIE
++from .addanime import AddAnimeIE
  from .archiveorg import ArchiveOrgIE
  from .ard import ARDIE
  from .arte import ArteTvIE
@@@ -7,10 -8,7 +8,10 @@@ from .bandcamp import BandcampI
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .breakcom import BreakIE
  from .brightcove import BrightcoveIE
 +from .c56 import C56IE
  from .canalplus import CanalplusIE
 +from .canalc2 import Canalc2IE
 +from .cnn import CNNIE
  from .collegehumor import CollegeHumorIE
  from .comedycentral import ComedyCentralIE
  from .condenast import CondeNastIE
@@@ -33,7 -31,6 +34,7 @@@ from .gametrailers import GametrailersI
  from .generic import GenericIE
  from .googleplus import GooglePlusIE
  from .googlesearch import GoogleSearchIE
 +from .hark import HarkIE
  from .hotnewhiphop import HotNewHipHopIE
  from .howcast import HowcastIE
  from .hypem import HypemIE
@@@ -41,7 -38,6 +42,7 @@@ from .ign import IGNIE, OneUPI
  from .ina import InaIE
  from .infoq import InfoQIE
  from .instagram import InstagramIE
 +from .jeuxvideo import JeuxVideoIE
  from .jukebox import JukeboxIE
  from .justintv import JustinTVIE
  from .kankan import KankanIE
@@@ -55,19 -51,14 +56,19 @@@ from .muzu import MuzuTVI
  from .myspass import MySpassIE
  from .myvideo import MyVideoIE
  from .nba import NBAIE
 +from .nbc import NBCNewsIE
  from .ooyala import OoyalaIE
 +from .pbs import PBSIE
  from .photobucket import PhotobucketIE
  from .pornotube import PornotubeIE
  from .rbmaradio import RBMARadioIE
  from .redtube import RedTubeIE
  from .ringtv import RingTVIE
 +from .ro220 import Ro220IE
  from .roxwel import RoxwelIE
 +from .rtlnow import RTLnowIE
  from .sina import SinaIE
 +from .slashdot import SlashdotIE
  from .soundcloud import SoundcloudIE, SoundcloudSetIE
  from .spiegel import SpiegelIE
  from .stanfordoc import StanfordOpenClassroomIE
@@@ -78,11 -69,9 +79,11 @@@ from .ted import TEDI
  from .tf1 import TF1IE
  from .thisav import ThisAVIE
  from .traileraddict import TrailerAddictIE
 +from .trilulilu import TriluliluIE
  from .tudou import TudouIE
  from .tumblr import TumblrIE
  from .tutv import TutvIE
 +from .unistra import UnistraIE
  from .ustream import UstreamIE
  from .vbox7 import Vbox7IE
  from .veoh import VeohIE
@@@ -90,6 -79,7 +91,6 @@@ from .vevo import VevoI
  from .videofyme import VideofyMeIE
  from .vimeo import VimeoIE, VimeoChannelIE
  from .vine import VineIE
 -from .c56 import C56IE
  from .wat import WatIE
  from .weibo import WeiboIE
  from .wimp import WimpIE
@@@ -123,14 -113,12 +124,14 @@@ _ALL_CLASSES = 
  ]
  _ALL_CLASSES.append(GenericIE)
  
 +
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
      The order does matter; the first extractor matched is the one handling the URL.
      """
      return [klass() for klass in _ALL_CLASSES]
  
 +
  def get_info_extractor(ie_name):
      """Returns the info extractor class with the given ie_name"""
      return globals()[ie_name+'IE']
index 7d126e2d24203b60343052acdb362d941fd7d56e,0000000000000000000000000000000000000000..b3bdb2955e974591c1a7afc5b20776261adeb193
mode 100644,000000..100644
--- /dev/null
@@@ -1,167 -1,0 +1,167 @@@
-             for formats_el in li.findall('.//li/a'):
 +import re
 +import xml.etree.ElementTree
 +
 +from .common import InfoExtractor
 +from ..utils import (
 +    determine_ext,
 +    ExtractorError,
 +)
 +
 +
 +class AppleTrailersIE(InfoExtractor):
 +    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
 +    _TEST = {
 +        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
 +        u"playlist": [
 +            {
 +                u"file": u"manofsteel-trailer4.mov",
 +                u"md5": u"11874af099d480cc09e103b189805d5f",
 +                u"info_dict": {
 +                    u"duration": 111,
 +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
 +                    u"title": u"Trailer 4",
 +                    u"upload_date": u"20130523",
 +                    u"uploader_id": u"wb",
 +                },
 +            },
 +            {
 +                u"file": u"manofsteel-trailer3.mov",
 +                u"md5": u"07a0a262aae5afe68120eed61137ab34",
 +                u"info_dict": {
 +                    u"duration": 182,
 +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
 +                    u"title": u"Trailer 3",
 +                    u"upload_date": u"20130417",
 +                    u"uploader_id": u"wb",
 +                },
 +            },
 +            {
 +                u"file": u"manofsteel-trailer.mov",
 +                u"md5": u"e401fde0813008e3307e54b6f384cff1",
 +                u"info_dict": {
 +                    u"duration": 148,
 +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
 +                    u"title": u"Trailer",
 +                    u"upload_date": u"20121212",
 +                    u"uploader_id": u"wb",
 +                },
 +            },
 +            {
 +                u"file": u"manofsteel-teaser.mov",
 +                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
 +                u"info_dict": {
 +                    u"duration": 93,
 +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
 +                    u"title": u"Teaser",
 +                    u"upload_date": u"20120721",
 +                    u"uploader_id": u"wb",
 +                },
 +            }
 +        ]
 +    }
 +
 +    def _real_extract(self, url):
 +        mobj = re.match(self._VALID_URL, url)
 +        movie = mobj.group('movie')
 +        uploader_id = mobj.group('company')
 +
 +        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
 +        playlist_snippet = self._download_webpage(playlist_url, movie)
 +        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
 +        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
 +
 +        size_cache = {}
 +
 +        doc = xml.etree.ElementTree.fromstring(playlist_html)
 +        playlist = []
 +        for li in doc.findall('./div/ul/li'):
 +            title = li.find('.//h3').text
 +            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 +            thumbnail = li.find('.//img').attrib['src']
 +
 +            date_el = li.find('.//p')
 +            upload_date = None
 +            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text)
 +            if m:
 +                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
 +            runtime_el = date_el.find('./br')
 +            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail)
 +            duration = None
 +            if m:
 +                duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 +
 +            formats = []
++            for formats_el in li.findall('.//a'):
 +                if formats_el.attrib['class'] != 'OverlayPanel':
 +                    continue
 +                target = formats_el.attrib['target']
 +
 +                format_code = formats_el.text
 +                if 'Automatic' in format_code:
 +                    continue
 +
 +                size_q = formats_el.attrib['href']
 +                size_id = size_q.rpartition('#videos-')[2]
 +                if size_id not in size_cache:
 +                    size_url = url + size_q
 +                    sizepage_html = self._download_webpage(
 +                        size_url, movie,
 +                        note=u'Downloading size info %s' % size_id,
 +                        errnote=u'Error while downloading size info %s' % size_id,
 +                    )
 +                    _doc = xml.etree.ElementTree.fromstring(sizepage_html)
 +                    size_cache[size_id] = _doc
 +
 +                sizepage_doc = size_cache[size_id]
 +                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
 +                for vid_a in links:
 +                    href = vid_a.get('href')
 +                    if not href.endswith(target):
 +                        continue
 +                    detail_q = href.partition('#')[0]
 +                    detail_url = url + '/' + detail_q
 +
 +                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
 +                    detail_id = m.group('detail_id')
 +
 +                    detail_html = self._download_webpage(
 +                        detail_url, movie,
 +                        note=u'Downloading detail %s %s' % (detail_id, size_id),
 +                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
 +                    )
 +                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
 +                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
 +                    assert movie_link_el.get('class') == 'movieLink'
 +                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
 +                    ext = determine_ext(movie_link)
 +                    assert ext == 'mov'
 +
 +                    formats.append({
 +                        'format': format_code,
 +                        'ext': ext,
 +                        'url': movie_link,
 +                    })
 +
 +            info = {
 +                '_type': 'video',
 +                'id': video_id,
 +                'title': title,
 +                'formats': formats,
 +                'title': title,
 +                'duration': duration,
 +                'thumbnail': thumbnail,
 +                'upload_date': upload_date,
 +                'uploader_id': uploader_id,
 +                'user_agent': 'QuickTime compatible (youtube-dl)',
 +            }
 +            # TODO: Remove when #980 has been merged
 +            info['url'] = formats[-1]['url']
 +            info['ext'] = formats[-1]['ext']
 +
 +            playlist.append(info)
 +
 +        return {
 +            '_type': 'playlist',
 +            'id': movie,
 +            'entries': playlist,
 +        }
index 52c4483c9ecafe914270444b848ba8ac5966f802,da50abfc1cd492b8d360ef601b44841a938c055b..12169b2bb9209dd901d280e02ce10349e1cad6b4
@@@ -47,8 -47,7 +47,8 @@@ class InfoExtractor(object)
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      player_url:     SWF Player URL (used for rtmpdump).
 -    subtitles:      The subtitle file contents.
 +    subtitles:      The subtitle file contents as a dictionary in the format
 +                    {language: subtitles}.
      view_count:     How many users have watched the video on the platform.
      urlhandle:      [internal] The urlHandle to be used to download the file,
                      like returned by urllib.request.urlopen
      @classmethod
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
 -        return re.match(cls._VALID_URL, url) is not None
 +
 +        # This does not use has/getattr intentionally - we want to know whether
 +        # we have cached the regexp for *this* class, whereas getattr would also
 +        # match the superclass
 +        if '_VALID_URL_RE' not in cls.__dict__:
 +            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 +        return cls._VALID_URL_RE.match(url) is not None
  
      @classmethod
      def working(cls):
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              if errnote is None:
                  errnote = u'Unable to download webpage'
--            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
++            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
  
      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
          """ Returns a tuple (page content as string, URL handle) """
diff --combined youtube_dl/utils.py
index 52cfb8a6d18e69be0db213bbf3f4da63f1fcd7d9,59eeaf4a89084783e1ca2607840b3b7dfc4670f5..f78b5fe7889e6493ed8d87046dd7294fb3e3809f
@@@ -60,6 -60,6 +60,11 @@@ try
  except ImportError: # Python 2
      import httplib as compat_http_client
  
++try:
++    # Python 3 keeps HTTPError in urllib.error; there is no http.error module.
++    from urllib.error import HTTPError as compat_HTTPError
++except ImportError:  # Python 2
++    from urllib2 import HTTPError as compat_HTTPError
++
  try:
      from subprocess import DEVNULL
      compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@@ -489,7 -489,7 +494,7 @@@ def make_HTTPS_handler(opts)
  
  class ExtractorError(Exception):
      """Error during info extraction."""
--    def __init__(self, msg, tb=None, expected=False):
++    def __init__(self, msg, tb=None, expected=False, cause=None):
          """ tb, if given, is the original traceback (so that it can be printed out).
          If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
          """
          if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
              expected = True
          if not expected:
 -            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 +            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
          super(ExtractorError, self).__init__(msg)
  
          self.traceback = tb
          self.exc_info = sys.exc_info()  # preserve original exception
++        self.cause = cause
  
      def format_traceback(self):
          if self.traceback is None:
@@@ -657,9 -657,6 +663,9 @@@ def determine_ext(url, default_ext=u'un
      else:
          return default_ext
  
 +def subtitles_filename(filename, sub_lang, sub_format):
 +    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 +
  def date_from_str(date_str):
      """
      Return a datetime object from a string in the format YYYYMMDD or