Merge pull request #1693 from alexvh/teamcoco_fix
[youtube-dl] / youtube_dl / extractor / teamcoco.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5     ExtractorError,
6     RegexNotFoundError,
7 )
8
9
class TeamcocoIE(InfoExtractor):
    """Information extractor for pages under http://teamcoco.com/video/."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
    _TEST = {
        u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
        u'file': u'19705.mp4',
        u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a',
        u'info_dict': {
            u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.",
            u"title": u"Louis C.K. Interview Pt. 1 11/3/11"
        }
    }

    # Quality markers looked for inside the <file> entries of the data XML,
    # ordered from most to least preferred.
    _QUALITIES = ['1080p', '720p', '1000k', '480p', '500k']

    def _real_extract(self, url):
        """Extract the video information for a teamcoco.com video page.

        The page is downloaded to find the numeric video id, then the
        per-video XML document is fetched and searched for a media URL
        containing one of the known quality markers.

        Args:
            url: address of the video page; must match ``_VALID_URL``.

        Returns:
            A one-element list holding the info dictionary (id, url, ext,
            title, thumbnail, description).

        Raises:
            ExtractorError: if ``url`` does not match ``_VALID_URL``.
            RegexNotFoundError: if no media URL with a known quality marker
                is present in the data document.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(
            data_url, video_id, 'Downloading data webpage')

        # Bind the result up front so that the "nothing found" check below
        # reports RegexNotFoundError instead of failing on an unbound local.
        video_url = None
        for quality in self._QUALITIES:
            regex = (r'<file [^>]*type="(?:high|standard)".*?>(.*%s.*)</file>'
                     % quality)
            try:
                # Catch the fatal exception rather than passing fatal=False,
                # which would print a warning for every missing quality.
                candidate = self._html_search_regex(regex, data, u'video URL')
            except RegexNotFoundError:
                continue
            # Qualities are tried best-first, so the first hit is the best
            # one available; later matches could never replace it.
            video_url = candidate
            break
        if not video_url:
            raise RegexNotFoundError(u'Unable to extract video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       self._og_search_title(webpage),
            'thumbnail':   self._og_search_thumbnail(webpage),
            'description': self._og_search_description(webpage),
        }]