X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcrunchyroll.py;h=bc2d1fa8b041e3ec1bbc4d6d1b5f055ac31ee140;hb=HEAD;hp=fd1e7afad298a6df57ad86a77e31116c9133b71b;hpb=a8f83f0c56e81b871a46c18fa9ebc6643370fa48;p=youtube-dl diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index fd1e7afad..bc2d1fa8b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re import json -import xml.etree.ElementTree as etree import zlib from hashlib import sha1 @@ -12,7 +11,9 @@ from .common import InfoExtractor from .vrv import VRVIE from ..compat import ( compat_b64decode, + compat_etree_Element, compat_etree_fromstring, + compat_str, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -25,9 +26,9 @@ from ..utils import ( intlist_to_bytes, int_or_none, lowercase_escape, + merge_dicts, remove_end, sanitized_Request, - unified_strdate, urlencode_postdata, xpath_text, ) @@ -103,19 +104,6 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else sanitized_Request(url_or_request)) - # Accept-Language must be set explicitly to accept any language to avoid issues - # similar to https://github.com/rg3/youtube-dl/issues/6797. - # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction - # should be imposed or not (from what I can see it just takes the first language - # ignoring the priority and requires it to correspond the IP). By the way this causes - # Crunchyroll to not work in georestriction cases in some browsers that don't place - # the locale lang first in header. However allowing any language seems to workaround the issue. - request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) - @staticmethod def _add_skip_wall(url): parsed_url = compat_urlparse.urlparse(url) @@ -124,7 +112,7 @@ class CrunchyrollBaseIE(InfoExtractor): # > This content may be inappropriate for some people. # > Are you sure you want to continue? # since it's not disabled by default in crunchyroll account's settings. - # See https://github.com/rg3/youtube-dl/issues/7202. + # See https://github.com/ytdl-org/youtube-dl/issues/7202. qs['skip_wall'] = ['1'] return compat_urlparse.urlunparse( parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) @@ -149,6 +137,7 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): # rtmp 'skip_download': True, }, + 'skip': 'Video gone', }, { 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', 'info_dict': { @@ -170,11 +159,12 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '702409', 'ext': 'mp4', - 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', - 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', + 'title': compat_str, + 'description': compat_str, 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'TV TOKYO', - 'upload_date': '20160508', + 'uploader': 'Re:Zero Partners', + 'timestamp': 1462098900, + 'upload_date': '20160501', }, 'params': { # m3u8 download @@ -185,12 +175,13 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", - 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', + 'title': compat_str, + 'description': compat_str, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', - 'upload_date': '20170118', - 'series': "KONOSUBA -God's blessing on this wonderful world!", + 'timestamp': 1484130900, + 'upload_date': '20170111', + 'series': compat_str, 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, 'episode': 'Give Me Deliverance From This Judicial Injustice!', @@ -213,10 +204,11 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '535080', 'ext': 'mp4', - 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka', - 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', + 'title': compat_str, + 'description': compat_str, 'uploader': 'Marvelous AQL Inc.', - 'upload_date': '20091021', + 'timestamp': 1255512600, + 'upload_date': '20091014', }, 'params': { # Just test metadata extraction @@ -237,15 +229,17 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): # just test metadata extraction 'skip_download': True, }, + 'skip': 'Video gone', }, { # A video with a vastly different season name compared to the series name 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', 'info_dict': { 'id': '590532', 'ext': 'mp4', - 'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test', - 'description': 'Mahiro and Nyaruko talk about official certification.', + 'title': compat_str, + 'description': compat_str, 'uploader': 'TV TOKYO', + 'timestamp': 1330956000, 'upload_date': '20120305', 'series': 'Nyarko-san: Another Crawling Chaos', 'season': 'Haiyoru! Nyaruani (ONA)', @@ -269,6 +263,19 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): '1080': ('80', '108'), } + def _download_webpage(self, url_or_request, *args, **kwargs): + request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + else sanitized_Request(url_or_request)) + # Accept-Language must be set explicitly to accept any language to avoid issues + # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. + # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction + # should be imposed or not (from what I can see it just takes the first language + # ignoring the priority and requires it to correspond the IP). By the way this causes + # Crunchyroll to not work in georestriction cases in some browsers that don't place + # the locale lang first in header. However allowing any language seems to workaround the issue. + request.add_header('Accept-Language', '*') + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) + def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(compat_b64decode(data)) iv = bytes_to_intlist(compat_b64decode(iv)) @@ -390,7 +397,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'Downloading subtitles for ' + sub_name, data={ 'subtitle_script_id': sub_id, }) - if not isinstance(sub_doc, etree.Element): + if not isinstance(sub_doc, compat_etree_Element): continue sid = sub_doc.get('id') iv = xpath_text(sub_doc, 'iv', 'subtitle iv') @@ -442,23 +449,21 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'language', default=None, group='lang') video_title = self._html_search_regex( - r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', - webpage, 'video_title') + (r'(?s)]*>((?:(?!]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!', + r'(.+?),\s+-\s+.+? Crunchyroll'), + webpage, 'video_title', default=None) + if not video_title: + video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) video_title = re.sub(r' {2,}', ' ', video_title) video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, webpage, 'description', default='{}'), video_id) or media_metadata).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) - video_upload_date = self._html_search_regex( - [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], - webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) - if video_upload_date: - video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex( # try looking for both an uploader that's a link and one that's not [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', fatal=False) + webpage, 'video_uploader', default=False) formats = [] for stream in media.get('streams', []): @@ -507,7 +512,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_quality': stream_quality, 'current_page': url, }) - if isinstance(streamdata, etree.Element): + if isinstance(streamdata, compat_etree_Element): stream_info = streamdata.find('./{default}preload/stream_info') if stream_info is not None: stream_infos.append(stream_info) @@ -518,7 +523,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_format': stream_format, 'video_encode_quality': stream_quality, }) - if isinstance(stream_info, etree.Element): + if isinstance(stream_info, compat_etree_Element): stream_infos.append(stream_info) for stream_info in stream_infos: video_encode_id = xpath_text(stream_info, './video_encode_id') @@ -593,7 +598,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text season = episode = episode_number = duration = thumbnail = None - if isinstance(metadata, etree.Element): + if isinstance(metadata, compat_etree_Element): season = xpath_text(metadata, 'series_title') episode = xpath_text(metadata, 'episode_title') episode_number = int_or_none(xpath_text(metadata, 'episode_number')) @@ -611,14 +616,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', webpage, 'season number', default=None)) - return { + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts({ 'id': video_id, 'title': video_title, 'description': video_description, 'duration': duration, 'thumbnail': thumbnail, 'uploader': video_uploader, - 'upload_date': video_upload_date, 'series': series, 'season': season, 'season_number': season_number, @@ -626,7 +632,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'episode_number': episode_number, 'subtitles': subtitles, 'formats': formats, - } + }, info) class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): @@ -661,9 +667,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): webpage = self._download_webpage( self._add_skip_wall(url), show_id, headers=self.geo_verification_headers()) - title = self._html_search_regex( - r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', - webpage, 'title') + title = self._html_search_meta('name', webpage, default=None) + episode_paths = re.findall( r'(?s)<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"', webpage)