[chirbit] fix profile downloader regex.
[youtube-dl] / youtube_dl / extractor / chirbit.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import clean_html
8
9
class ChirbitIE(InfoExtractor):
    """Extractor for a single chirb.it audio short link."""

    _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://chirb.it/PrIPv5',
        'md5': '9847b0dad6ac3e074568bf2cfb197de8',
        'info_dict': {
            'id': 'PrIPv5',
            'display_id': 'kukushtv_1423231243',
            'ext': 'mp3',
            'title': 'Фасадстрой',
            'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3'
        }
    }

    def _real_extract(self, url):
        """Download the chirb.it page for *url* and build its info dict.

        The short-link id taken from the URL becomes ``id``; the audio file
        name found in the page's ``"setFile"`` call becomes ``display_id``
        and is used to rebuild the direct mp3 URL.
        """
        audio_linkid = self._match_id(url)
        webpage = self._download_webpage(url, audio_linkid)

        audio_title = self._html_search_regex(
            r'<h2\s+itemprop="name">(.*?)</h2>', webpage, 'title')
        # Dots are escaped so that only the literal host name and the
        # literal ".mp3" extension are accepted.
        audio_id = self._html_search_regex(
            r'\("setFile",\s+"http://audio\.chirbit\.com/(.*?)\.mp3"\)',
            webpage, 'audio ID')
        audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'

        return {
            'id': audio_linkid,
            'display_id': audio_id,
            'title': audio_title,
            'url': audio_url
        }
38
class ChirbitProfileIE(InfoExtractor):
    """Extractor that turns a chirbit.com profile into a playlist."""

    _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?P<id>[^/]+)/?$'
    _TEST = {
        'url': 'http://chirbit.com/ScarletBeauty',
        'playlist_count': 3,
        'info_dict': {
            '_type': 'playlist',
            'title': 'ScarletBeauty',
            'id': 'ScarletBeauty'
        }
    }

    # Numeric URL suffix requested once up front; the last audio id found
    # on that page is treated as the profile's oldest entry.
    _OLDEST_ENTRY_PROBE = 24599
    # Dots are escaped so only the literal host and ".mp3" suffix match.
    _AUDIO_ID_RE = r'soundFile:\s*"http://audio\.chirbit\.com/(.*?)\.mp3"'
    _TITLE_RE = r'<div\s+class="chirbit_title"\s*>(.*?)</div>'

    def _real_extract(self, url):
        """Collect every audio entry of the profile at *url*.

        Pages ``<profile>/0``, ``<profile>/1``, ... are downloaded until the
        oldest entry shows up, or until a page contributes no audio id that
        has not been seen already (which also covers empty pages).

        Returns a playlist info dict whose ``id`` and ``title`` are both the
        profile name and whose ``entries`` hold one dict per audio file.
        """
        profile_id = self._match_id(url)
        # _VALID_URL tolerates a trailing slash; drop it so the page URLs
        # built below never contain "//".
        base_url = url.rstrip('/')

        # Chirbit has a pretty weird "Last Page" navigation behavior.
        # We grab the profile's oldest entry to determine when to
        # stop fetching entries.
        probe_page = self._download_webpage(
            '%s/%d' % (base_url, self._OLDEST_ENTRY_PROBE), profile_id)
        probe_ids = re.findall(self._AUDIO_ID_RE, probe_page)
        # NOTE(review): clean_html is applied only to the probe result, not
        # to the ids collected below; if it ever alters an id, the match on
        # the oldest entry fails and the loop ends through the
        # "no new ids" guard instead.
        oldest_entry = clean_html(probe_ids[-1]) if probe_ids else None

        entries = []
        seen_ids = set()
        page_num = 0
        while True:
            page = self._download_webpage(
                '%s/%d' % (base_url, page_num), profile_id)
            page_ids = re.findall(self._AUDIO_ID_RE, page)
            page_titles = re.findall(self._TITLE_RE, page)

            found_new = False
            for index, audio_id in enumerate(page_ids):
                if audio_id in seen_ids:
                    continue
                seen_ids.add(audio_id)
                found_new = True
                # Titles are paired with ids by position within the page;
                # fall back to the id if the page yielded fewer titles.
                if index < len(page_titles):
                    title = page_titles[index]
                else:
                    title = audio_id
                entries.append({
                    'id': audio_id,
                    'title': title,
                    'url': 'http://audio.chirbit.com/' + audio_id + '.mp3'
                })

            # Stop on the oldest entry, or when the page brought nothing
            # new, so the loop cannot spin forever.
            if not found_new or oldest_entry in page_ids:
                break
            page_num += 1

        return {
            '_type': 'playlist',
            'id': profile_id,
            'title': profile_id,
            'entries': entries
        }