d4f4ecc58afe4bac88878d5a6c41a0c85f533fc0
[youtube-dl] / youtube_dl / extractor / gogoanime.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     compat_urllib_parse,
9     get_element_by_attribute,
10     unescapeHTML
11 )
12
13
class GoGoAnimeIE(InfoExtractor):
    """Extractor for individual GoGoAnime post pages.

    The page itself hosts no media: the embedded ``<iframe>`` players found
    in the ``postcontent`` element are handed back as URLs for other
    extractors to resolve. Depending on the page layout the result is either
    a playlist of all embeds or a single ``url`` result for the first one.
    """

    IE_NAME = 'gogoanime'
    IE_DESC = 'GoGoAnime'
    # Dots are escaped so that they only match a literal '.' in the host name.
    _VALID_URL = r'http://www\.gogoanime\.com/(?P<id>[A-Za-z0-9-]+)'

    _TEST = {
        'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-movie-1'
        },
        'playlist_count': 3
    }

    def _real_extract(self, url):
        """Return the embedded player URL(s) found on a post page.

        Returns a playlist result when the post content opens directly with
        an iframe followed by ``<br />``; otherwise a single ``url`` result
        pointing at the first embed, titled from the page's ``<h1>``.

        Raises ExtractorError when the site reports a missing page, when the
        post content cannot be located, or when no usable embed is left for
        a single-video page.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        if 'Oops! Page Not Found</font>' in page:
            raise ExtractorError('Video does not exist', expected=True)

        content = get_element_by_attribute("class", "postcontent", page)
        if content is None:
            # Without this guard re.findall() below would fail with an
            # opaque TypeError instead of a meaningful extractor error.
            raise ExtractorError('Unable to find post content')

        embed_urls = re.findall(
            r'<iframe[^>]*?src=[\'"](h[^\'"]+)[\'"]', content)
        # videofun embeds are skipped (presumably unsupported -- the reason
        # is not visible in this file); the test runs on the raw attribute
        # value, before unquoting/unescaping.
        vids = [
            unescapeHTML(compat_urllib_parse.unquote(x))
            for x in embed_urls if 'videofun' not in x]

        # A post body that starts with "<p><iframe ...></iframe><br />" is
        # treated as a multi-part page and returned as a playlist.
        if re.search(r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />', page):
            return self.playlist_result([self.url_result(vid) for vid in vids], video_id)

        if not vids:
            # Avoid a bare IndexError on vids[0] below.
            raise ExtractorError('No supported video embed found')

        title = self._html_search_regex(
            r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>', page, 'title')

        return {
            '_type': 'url',
            'id': video_id,
            'url': vids[0],
            'title': title,
        }
52
53
class GoGoAnimeSearchIE(InfoExtractor):
    """Extractor for GoGoAnime search result pages.

    Each result link on the page becomes one playlist entry; the search
    term captured from the ``s`` query parameter is used as the playlist id.
    """

    IE_NAME = 'gogoanime:search'
    IE_DESC = 'GoGoAnime Search'

    _VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>[^&]*)'
    _TEST = {
        'url': 'http://www.gogoanime.com/?s=bokusatsu',
        'info_dict': {
            'id': 'bokusatsu'
        },
        'playlist_count': 6
    }

    def _real_extract(self, url):
        """Return a playlist of the post URLs linked from a search page."""
        query = self._match_id(url)
        results_html = self._download_webpage(url, query)

        # The first link inside the leading <p> of every "postlist" block is
        # taken as the address of one search hit.
        link_pattern = (
            r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"')

        entries = []
        for post_url in re.findall(link_pattern, results_html):
            entries.append(self.url_result(post_url))

        return self.playlist_result(entries, query)