Merge remote-tracking branch 'alab1001101/master'
author Philipp Hagemeister <phihag@phihag.de>
Tue, 27 Nov 2012 16:14:29 +0000 (17:14 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Tue, 27 Nov 2012 16:14:29 +0000 (17:14 +0100)
1  2 
youtube_dl/InfoExtractors.py
youtube_dl/__init__.py

index 13b04ab5bcce4ee1e57e46afab0b198f1a477991,82459e7a80868467eb15d3a2f840d297666495c0..cea30dad81fa4224a848732159aa19684c7d5dbc
@@@ -1140,6 -1077,161 +1140,143 @@@ class VimeoIE(InfoExtractor)
                }]
  
  
 -        """arte.tv information extractor."""
 -
 -        _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
 -        _LIVE_URL = r'index-[0-9]+\.html$'
 -
 -        IE_NAME = u'arte.tv'
 -
 -        def __init__(self, downloader=None):
 -                InfoExtractor.__init__(self, downloader)
 -
 -        def report_download_webpage(self, video_id):
 -                """Report webpage download."""
 -                self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
 -
 -        def report_extraction(self, video_id):
 -                """Report information extraction."""
 -                self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
 -
 -        def fetch_webpage(self, url):
 -                self._downloader.increment_downloads()
 -                request = urllib2.Request(url)
 -                try:
 -                        self.report_download_webpage(url)
 -                        webpage = urllib2.urlopen(request).read()
 -                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 -                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 -                        return
 -                except ValueError, err:
 -                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 -                        return
 -                return webpage
 -
 -        def grep_webpage(self, url, regex, regexFlags, matchTuples):
 -                page = self.fetch_webpage(url)
 -                mobj = re.search(regex, page, regexFlags)
 -                info = {}
 -
 -                if mobj is None:
 -                    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 -                    return
 -
 -                for (i, key, err) in matchTuples:
 -                    if mobj.group(i) is None:
 -                        self._downloader.trouble(err)
 -                        return
 -                    else:
 -                        info[key] = mobj.group(i)
 -
 -                return info
 -
 -        def extractLiveStream(self, url):
 -
 -                video_lang = url.split('/')[-4]
 -
 -                info = self.grep_webpage(
 -                    url,
 -                    r'src="(.*?/videothek_js.*?\.js)',
 -                    0,
 -                    [
 -                        (1, 'url', u'ERROR: Invalid URL: %s' % url)
 -                    ]
 -                )
 -
 -                http_host = url.split('/')[2]
 -                next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
 -
 -                info = self.grep_webpage(
 -                    next_url,
 -                    r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
 -                     '(http://.*?\.swf).*?' +
 -                     '(rtmp://.*?)\'',
 -                    re.DOTALL,
 -                    [
 -                        (1, 'path',   u'ERROR: could not extract video path: %s' % url),
 -                        (2, 'player', u'ERROR: could not extract video player: %s' % url),
 -                        (3, 'url',    u'ERROR: could not extract video url: %s' % url)
 -                    ]
 -                )
 -
 -                video_url = u'%s/%s' % (info.get('url'), info.get('path'))
 -
 -                print u'rtmpdump --swfVfy \'%s\' --rtmp \'%s\' --live -o arte-live.mp4' % (info.get('player'), video_url)
 -
 -        def extractPlus7Stream(self, url):
 -
 -                video_lang = url.split('/')[-3]
 -
 -                info = self.grep_webpage(
 -                    url,
 -                    r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
 -                    0,
 -                    [
 -                        (1, 'url', u'ERROR: Invalid URL: %s' % url)
 -                    ]
 -                )
 -
 -                next_url = urllib.unquote(info.get('url'))
 -
 -                info = self.grep_webpage(
 -                    next_url,
 -                    r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
 -                    0,
 -                    [
 -                        (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
 -                    ]
 -                )
 -
 -                next_url = urllib.unquote(info.get('url'))
 -
 -                info = self.grep_webpage(
 -                    next_url,
 -                    r'<video id="(.*?)".*?>.*?' +
 -                     '<name>(.*?)</name>.*?' +
 -                     '<dateVideo>(.*?)</dateVideo>.*?' +
 -                     '<url quality="hd">(.*?)</url>',
 -                    re.DOTALL,
 -                    [
 -                        (1, 'id',    u'ERROR: could not extract video id: %s' % url),
 -                        (2, 'title', u'ERROR: could not extract video title: %s' % url),
 -                        (3, 'date',  u'ERROR: could not extract video date: %s' % url),
 -                        (4, 'url',   u'ERROR: could not extract video url: %s' % url)
 -                    ]
 -                )
 -
 -                return {
 -                    'id':           info.get('id'),
 -                    'url':          urllib.unquote(info.get('url')),
 -                    'uploader':     u'arte.tv',
 -                    'upload_date':  info.get('date'),
 -                    'title':        info.get('title'),
 -                    'ext':          u'mp4',
 -                    'format':       u'NA',
 -                    'player_url':   None,
 -                }
 -
 -        def _real_extract(self, url):
 -
 -                video_id = url.split('/')[-1]
 -
 -                self.report_extraction(video_id)
 -
 -                if re.search(self._LIVE_URL, video_id) is not None:
 -                    self.extractLiveStream(url)
 -                    return
 -                else:
 -                    info = self.extractPlus7Stream(url)
 -
 -                try:
 -                        # Process video information
 -                        self._downloader.process_info(info)
 -                except UnavailableVideoError, err:
 -                        self._downloader.trouble(u'\nERROR: unable to download video')
+ class ArteTvIE(InfoExtractor):
++      """arte.tv information extractor."""
++
++      _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
++      _LIVE_URL = r'index-[0-9]+\.html$'
++
++      IE_NAME = u'arte.tv'
++
++      def __init__(self, downloader=None):
++              InfoExtractor.__init__(self, downloader)
++
++      def report_download_webpage(self, video_id):
++              """Report webpage download."""
++              self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
++
++      def report_extraction(self, video_id):
++              """Report information extraction."""
++              self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
++
++      def fetch_webpage(self, url):
++              self._downloader.increment_downloads()
++              request = urllib2.Request(url)
++              try:
++                      self.report_download_webpage(url)
++                      webpage = urllib2.urlopen(request).read()
++              except (urllib2.URLError, httplib.HTTPException, socket.error), err:
++                      self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
++                      return
++              except ValueError, err:
++                      self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
++                      return
++              return webpage
++
++      def grep_webpage(self, url, regex, regexFlags, matchTuples):
++              page = self.fetch_webpage(url)
++              mobj = re.search(regex, page, regexFlags)
++              info = {}
++
++              if mobj is None:
++                      self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
++                      return
++
++              for (i, key, err) in matchTuples:
++                      if mobj.group(i) is None:
++                              self._downloader.trouble(err)
++                              return
++                      else:
++                              info[key] = mobj.group(i)
++
++              return info
++
++      def extractLiveStream(self, url):
++              video_lang = url.split('/')[-4]
++              info = self.grep_webpage(
++                      url,
++                      r'src="(.*?/videothek_js.*?\.js)',
++                      0,
++                      [
++                              (1, 'url', u'ERROR: Invalid URL: %s' % url)
++                      ]
++              )
++              http_host = url.split('/')[2]
++              next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
++              info = self.grep_webpage(
++                      next_url,
++                      r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
++                              '(http://.*?\.swf).*?' +
++                              '(rtmp://.*?)\'',
++                      re.DOTALL,
++                      [
++                              (1, 'path',   u'ERROR: could not extract video path: %s' % url),
++                              (2, 'player', u'ERROR: could not extract video player: %s' % url),
++                              (3, 'url',    u'ERROR: could not extract video url: %s' % url)
++                      ]
++              )
++              video_url = u'%s/%s' % (info.get('url'), info.get('path'))
++
++      def extractPlus7Stream(self, url):
++              video_lang = url.split('/')[-3]
++              info = self.grep_webpage(
++                      url,
++                      r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
++                      0,
++                      [
++                              (1, 'url', u'ERROR: Invalid URL: %s' % url)
++                      ]
++              )
++              next_url = urllib.unquote(info.get('url'))
++              info = self.grep_webpage(
++                      next_url,
++                      r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
++                      0,
++                      [
++                              (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
++                      ]
++              )
++              next_url = urllib.unquote(info.get('url'))
++
++              info = self.grep_webpage(
++                      next_url,
++                      r'<video id="(.*?)".*?>.*?' +
++                              '<name>(.*?)</name>.*?' +
++                              '<dateVideo>(.*?)</dateVideo>.*?' +
++                              '<url quality="hd">(.*?)</url>',
++                      re.DOTALL,
++                      [
++                              (1, 'id',    u'ERROR: could not extract video id: %s' % url),
++                              (2, 'title', u'ERROR: could not extract video title: %s' % url),
++                              (3, 'date',  u'ERROR: could not extract video date: %s' % url),
++                              (4, 'url',   u'ERROR: could not extract video url: %s' % url)
++                      ]
++              )
++
++              return {
++                      'id':           info.get('id'),
++                      'url':          urllib.unquote(info.get('url')),
++                      'uploader':     u'arte.tv',
++                      'upload_date':  info.get('date'),
++                      'title':        info.get('title'),
++                      'ext':          u'mp4',
++                      'format':       u'NA',
++                      'player_url':   None,
++              }
++
++      def _real_extract(self, url):
++              video_id = url.split('/')[-1]
++              self.report_extraction(video_id)
++
++              if re.search(self._LIVE_URL, video_id) is not None:
++                      self.extractLiveStream(url)
++                      return
++              else:
++                      info = self.extractPlus7Stream(url)
++
++              return [info]
  class GenericIE(InfoExtractor):
        """Generic last-resort information extractor."""
  
index 92478aa6bb71d2b3a92127eb9e1877ec4eb813dd,13cf77896e52cf0a546d3663b44fb5d635923997..f7a49e13a8f85a1c4fcb5030078e554416f1df85
@@@ -364,10 -351,8 +364,10 @@@ def gen_extractors()
                MixcloudIE(),
                StanfordOpenClassroomIE(),
                MTVIE(),
 -                ArteTvIE(),
 -
 +              YoukuIE(),
 +              XNXXIE(),
 +              GooglePlusIE(),
++              ArteTvIE(),
                GenericIE()
        ]