_ Git - youtube-dl/blob - youtube_dl/extractor/jukebox.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     RegexNotFoundError,
   9     unescapeHTML,
  10 )
  11
  12
  13 class JukeboxIE(InfoExtractor):
  14     _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
  15     _TEST = {
  16         'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
  17         'md5': '1574e9b4d6438446d5b7dbcdf2786276',
  18         'info_dict': {
  19             'id': 'r303r',
  20             'ext': 'flv',
  21             'title': 'Kosheen-En Vivo Pride',
  22             'uploader': 'Kosheen',
  23         },
  24     }
  25
  26     def _real_extract(self, url):
  27         mobj = re.match(self._VALID_URL, url)
  28         video_id = mobj.group('video_id')
  29
  30         html = self._download_webpage(url, video_id)
  31         iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
  32
  33         iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
  34         if re.search(r'class="jkb_waiting"', iframe_html) is not None:
  35             raise ExtractorError('Video is not available(in your country?)!')
  36
  37         self.report_extraction(video_id)
  38
  39         try:
  40             video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
  41                 iframe_html, 'video url')
  42             video_url = unescapeHTML(video_url).replace('\/', '/')
  43         except RegexNotFoundError:
  44             youtube_url = self._search_regex(
  45                 r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
  46                 iframe_html, 'youtube url')
  47             youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
  48             self.to_screen('Youtube video detected')
  49             return self.url_result(youtube_url, ie='Youtube')
  50
  51         title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
  52             html, 'title')
  53         artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
  54             html, 'artist')
  55
  56         return {
  57             'id': video_id,
  58             'url': video_url,
  59             'title': artist + '-' + title,
  60             'uploader': artist,
  61         }