[bigflix] Extract all formats
[youtube-dl] / youtube_dl / extractor / bigflix.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import compat_urllib_parse_unquote
9
10
class BigflixIE(InfoExtractor):
    """Extractor for video pages on bigflix.com.

    The numeric video id is the last path component of the page URL.
    """

    _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537',
        'md5': 'ec76aa9b1129e2e5b301a474e54fab74',
        'info_dict': {
            'id': '16537',
            'ext': 'mp4',
            'title': 'Singham Returns',
            'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d',
        }
    }, {
        # multiple formats
        'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
        'info_dict': {
            'id': '16070',
            'ext': 'mp4',
            'title': 'Madarasapatinam',
            'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca',
            'formats': 'mincount:2',
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Download the video page and build the info dict.

        Every ``ContentURL_<height>p...=<value>`` entry found in the page
        becomes one format.  When the page has no such entry, the single
        ``file=<value>`` parameter is used as the only format.

        Returns a dict with the keys ``id``, ``title``, ``description`` and
        ``formats``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
            webpage, 'title')

        def decode_url(quoted_b64_url):
            """Return the text URL hidden in an unquoted-then-base64 value."""
            # Order matters: the unquoted *text* is turned into ASCII bytes
            # first, b64decode then yields bytes, and only those bytes are
            # decoded to text.  Calling .encode() on the b64decode() result
            # instead fails on Python 3, where bytes has no encode() method.
            b64_bytes = compat_urllib_parse_unquote(
                quoted_b64_url).encode('ascii')
            return base64.b64decode(b64_bytes).decode('utf-8')

        # One format per advertised height, in the order found in the page.
        formats = [{
            'url': decode_url(encoded_url),
            'format_id': '%sp' % height,
            'height': int(height),
        } for height, encoded_url in re.findall(
            r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage)]

        if not formats:
            # No per-height entries: fall back to the lone "file=" value.
            formats.append({
                'url': decode_url(self._search_regex(
                    r'file=([^&]+)', webpage, 'video url')),
            })

        description = self._html_search_meta('description', webpage)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats
        }