_ Git - youtube-dl/blob - youtube_dl/extractor/internetvideoarchive.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     compat_urlparse,
   8     compat_urllib_parse,
   9     xpath_with_ns,
  10 )
  11
  12
  13 class InternetVideoArchiveIE(InfoExtractor):
  14     _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
  15
  16     _TEST = {
  17         'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
  18         'info_dict': {
  19             'id': '452693',
  20             'ext': 'mp4',
  21             'title': 'SKYFALL',
  22             'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
  23             'duration': 149,
  24         },
  25     }
  26
  27     @staticmethod
  28     def _build_url(query):
  29         return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
  30
  31     @staticmethod
  32     def _clean_query(query):
  33         NEEDED_ARGS = ['publishedid', 'customerid']
  34         query_dic = compat_urlparse.parse_qs(query)
  35         cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
  36         # Other player ids return m3u8 urls
  37         cleaned_dic['playerid'] = '247'
  38         cleaned_dic['videokbrate'] = '100000'
  39         return compat_urllib_parse.urlencode(cleaned_dic)
  40
  41     def _real_extract(self, url):
  42         query = compat_urlparse.urlparse(url).query
  43         query_dic = compat_urlparse.parse_qs(query)
  44         video_id = query_dic['publishedid'][0]
  45         url = self._build_url(query)
  46
  47         flashconfiguration = self._download_xml(url, video_id,
  48             'Downloading flash configuration')
  49         file_url = flashconfiguration.find('file').text
  50         file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
  51         # Replace some of the parameters in the query to get the best quality
  52         # and http links (no m3u8 manifests)
  53         file_url = re.sub(r'(?<=\?)(.+)$',
  54             lambda m: self._clean_query(m.group()),
  55             file_url)
  56         info = self._download_xml(file_url, video_id,
  57             'Downloading video info')
  58         item = info.find('channel/item')
  59
  60         def _bp(p):
  61             return xpath_with_ns(p,
  62                 {'media': 'http://search.yahoo.com/mrss/',
  63                 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
  64         formats = []
  65         for content in item.findall(_bp('media:group/media:content')):
  66             attr = content.attrib
  67             f_url = attr['url']
  68             width = int(attr['width'])
  69             bitrate = int(attr['bitrate'])
  70             format_id = '%d-%dk' % (width, bitrate)
  71             formats.append({
  72                 'format_id': format_id,
  73                 'url': f_url,
  74                 'width': width,
  75                 'tbr': bitrate,
  76             })
  77
  78         self._sort_formats(formats)
  79
  80         return {
  81             'id': video_id,
  82             'title': item.find('title').text,
  83             'formats': formats,
  84             'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
  85             'description': item.find('description').text,
  86             'duration': int(attr['duration']),
  87         }