Merge pull request #5588 from aajanki/encode_frag_filenames

[youtube-dl] / youtube_dl / extractor / crunchyroll.py
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py

index 41f0c736d98c229518bacb41fac2f35ce9b80958..c2162aa68987f70db7576abeca2228be4460dd1d 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -12,12 +12,15 @@ from math import pow, sqrt, floor
  from .common import InfoExtractor
  from ..compat import (
      compat_urllib_parse,
+    compat_urllib_parse_unquote,
      compat_urllib_request,
+    compat_urlparse,
  )
  from ..utils import (
      ExtractorError,
      bytes_to_intlist,
      intlist_to_bytes,
+    remove_end,
      unified_strdate,
      urlencode_postdata,
  )
@@ -27,7 +30,7 @@ from ..aes import (
  
  
  class CrunchyrollIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
      _NETRC_MACHINE = 'crunchyroll'
      _TESTS = [{
          'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -45,6 +48,22 @@ class CrunchyrollIE(InfoExtractor):
              # rtmp
              'skip_download': True,
          },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+        'info_dict': {
+            'id': '589804',
+            'ext': 'flv',
+            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+            'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Danny Choo Network',
+            'upload_date': '20120213',
+        },
+        'params': {
+            # rtmp
+            'skip_download': True,
+        },
+
      }, {
          'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
          'only_matching': True,
@@ -218,7 +237,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
              webpage_url = 'http://www.' + mobj.group('url')
  
          webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
-        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
+        note_m = self._html_search_regex(
+            r'<div class="showmedia-trailer-notice">(.+?)</div>',
+            webpage, 'trailer-notice', default='')
          if note_m:
              raise ExtractorError(note_m)
  
@@ -228,6 +249,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
              if msg.get('type') == 'error':
                  raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
  
+        if 'To view this, please log in to verify you are 18 or older.' in webpage:
+            self.raise_login_required()
+
          video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
          video_title = re.sub(r' {2,}', ' ', video_title)
          video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@@ -238,7 +262,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
              video_upload_date = unified_strdate(video_upload_date)
          video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
  
-        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
+        playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
          playerdata_req = compat_urllib_request.Request(playerdata_url)
          playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
          playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -251,16 +275,31 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
          for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
              stream_quality, stream_format = self._FORMAT_IDS[fmt]
              video_format = fmt + 'p'
-            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
-            # urlencode doesn't work!
-            streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+            streamdata_req = compat_urllib_request.Request(
+                'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+                % (stream_id, stream_format, stream_quality),
+                compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
              streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
              streamdata = self._download_xml(
                  streamdata_req, video_id,
                  note='Downloading media info for %s' % video_format)
-            video_url = streamdata.find('./host').text
-            video_play_path = streamdata.find('./file').text
+            stream_info = streamdata.find('./{default}preload/stream_info')
+            video_url = stream_info.find('./host').text
+            video_play_path = stream_info.find('./file').text
+
+            if '.fplive.net/' in video_url:
+                video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+                parsed_video_url = compat_urlparse.urlparse(video_url)
+                direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+                    netloc='v.lvlt.crcdn.net',
+                    path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))
+                if self._is_valid_url(direct_video_url, video_id, video_format):
+                    formats.append({
+                        'url': direct_video_url,
+                        'format_id': video_format,
+                    })
+                    continue
+
              formats.append({
                  'url': video_url,
                  'play_path': video_play_path,