Merge remote-tracking branch 'rzhxeo/crunchyroll'
[youtube-dl] / youtube_dl / extractor / yahoo.py
1 import itertools
2 import json
3 import re
4
5 from .common import InfoExtractor, SearchInfoExtractor
6 from ..utils import (
7     compat_urllib_parse,
8     compat_urlparse,
9     determine_ext,
10     clean_html,
11 )
12
13
class YahooIE(InfoExtractor):
    """Information extractor for videos on screen.yahoo.com.

    The numeric id at the end of the page URL is used as the video id.  The
    page itself only provides a "long id", which is then used to query the
    Yahoo video streams service for the metadata and the available streams.
    """
    IE_DESC = u'Yahoo screen'
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
    _TESTS = [
        {
            u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            u'file': u'214727115.mp4',
            u'md5': u'4962b075c08be8690a922ee026d05e69',
            u'info_dict': {
                u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
                u'description': u'Julian and Travis watch Julian Smith',
            },
        },
        {
            u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            u'file': u'103000935.mp4',
            u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
            u'info_dict': {
                u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
                u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
        },
    ]

    def _real_extract(self, url):
        """Extract the video information for a screen.yahoo.com page URL.

        Downloads the page, reads the ``mediaItems`` JSON object embedded in
        it to obtain the long id of the first media object, and delegates
        the rest of the work to ``_get_info``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The JSON object is expected to run up to the end of its line,
        # hence the '$' anchor combined with re.MULTILINE.
        items_json = self._search_regex(r'mediaItems: ({.*?})$',
            webpage, u'items', flags=re.MULTILINE)
        items = json.loads(items_json)
        info = items['mediaItems']['query']['results']['mediaObj'][0]
        # The 'meta' field is not always in the video webpage, we request it
        # from another page
        long_id = info['id']
        return self._get_info(long_id, video_id)

    def _get_info(self, long_id, video_id):
        """Query the streams service and build the info dictionary.

        long_id  -- identifier used in the ``yahoo.media.video.streams``
                    query.
        video_id -- short id reported as 'id' in the result and used when
                    downloading the query result.

        Returns a dictionary with the keys 'id', 'title', 'formats',
        'description' and 'thumbnail'.  The formats are sorted in ascending
        order of (height, width).
        """
        query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
                 ' AND protocol="http"' % long_id)
        data = compat_urllib_parse.urlencode({
            'q': query,
            'env': 'prod',
            'format': 'json',
        })
        query_result_json = self._download_webpage(
            'http://video.query.yahoo.com/v1/public/yql?' + data,
            video_id, u'Downloading video info')
        query_result = json.loads(query_result_json)
        info = query_result['query']['results']['mediaObj'][0]
        meta = info['meta']

        formats = []
        for s in info['streams']:
            format_info = {
                'width': s.get('width'),
                'height': s.get('height'),
                'bitrate': s.get('bitrate'),
            }

            host = s['host']
            path = s['path']
            if host.startswith('rtmp'):
                # For rtmp streams the host is the URL and the path is
                # passed separately as the play path.
                format_info.update({
                    'url': host,
                    'play_path': path,
                    'ext': 'flv',
                })
            else:
                format_url = compat_urlparse.urljoin(host, path)
                format_info['url'] = format_url
                format_info['ext'] = determine_ext(format_url)

            formats.append(format_info)

        # 'height' and 'width' come from s.get() and may be None.  None
        # cannot be ordered against integers on Python 3, so missing
        # dimensions are treated as 0, which keeps such formats first.
        formats = sorted(
            formats,
            key=lambda f: (f['height'] or 0, f['width'] or 0))

        return {
            'id': video_id,
            'title': meta['title'],
            'formats': formats,
            'description': clean_html(meta['description']),
            'thumbnail': meta['thumbnail'],
        }
99
100
class YahooNewsIE(YahooIE):
    """Information extractor for videos on news.yahoo.com.

    Only the way the long id is found in the page differs from YahooIE;
    the stream lookup itself is inherited through ``_get_info``.
    """
    IE_NAME = 'yahoo:news'
    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'

    # Overwrite YahooIE properties we don't want
    _TESTS = []

    _TEST = {
        u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
        u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
        u'info_dict': {
            u'id': u'104538833',
            u'ext': u'mp4',
            u'title': u'China Moses Is Crazy About the Blues',
            u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
        },
    }

    def _real_extract(self, url):
        """Extract the video information for a news.yahoo.com video URL.

        The short id comes from the URL; the long id is read from the
        ``contentId`` entry of the downloaded page and handed to
        ``_get_info``.
        """
        short_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, short_id)
        content_id = self._search_regex(
            r'contentId: \'(.+?)\',', page, u'long id')
        return self._get_info(content_id, short_id)
125
126
class YahooSearchIE(SearchInfoExtractor):
    """Search extractor that returns Yahoo screen videos as a playlist."""
    IE_DESC = u'Yahoo screen search'
    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        query -- the search terms; also used as the playlist 'id'.
        n     -- the maximum number of entries to collect.

        Returns a playlist dictionary ('_type', 'id', 'entries') whose
        entries are URL results handled by the 'Yahoo' extractor.  Result
        pages are requested in steps of 30 until n entries were collected,
        a page comes back empty, or the service reports the last result.
        """
        # Offset step between two consecutive result pages ('b' parameter).
        page_size = 30

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }
        if n <= 0:
            # Nothing was requested, so no page needs to be downloaded.
            return res

        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * page_size)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # This result carries no screen.yahoo.com link; skip it
                    # instead of failing on mobj.group().
                    continue
                entries.append(
                    self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Stop on the number of collected entries rather than on the
            # inner loop index: the index is undefined for an empty page and
            # is one too small when a page exactly satisfies n.  An empty
            # page also ends the search so the loop cannot spin forever.
            if (not results
                    or len(entries) >= n
                    or m[u'last'] >= (m[u'total'] - 1)):
                break

        return res