Merge pull request #9367 from codesparkle/master
author Yen Chi Hsuan <yan12125@gmail.com>
Thu, 5 May 2016 17:44:03 +0000 (01:44 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Thu, 5 May 2016 17:44:03 +0000 (01:44 +0800)
Feature: --restrict-filenames: replace accented characters with their unaccented counterparts instead of "_"
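
A minimal sketch of the accent-folding idea behind the feature, assuming an NFKD-based transliteration (hypothetical helper, not the actual patch):

    import unicodedata

    def strip_accents(s):
        # NFKD splits "é" into "e" + a combining acute accent; dropping the
        # combining marks keeps the plain letter instead of replacing it with "_".
        return ''.join(
            c for c in unicodedata.normalize('NFKD', s)
            if not unicodedata.combining(c))

    print(strip_accents('Añejo café'))  # Anejo cafe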

15 files changed:
Makefile
README.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/aol.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dailymail.py [new file with mode: 0644]
youtube_dl/extractor/extractors.py
youtube_dl/extractor/fczenit.py
youtube_dl/extractor/kuwo.py
youtube_dl/extractor/redtube.py
youtube_dl/extractor/udemy.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/xfileshare.py
youtube_dl/extractor/xiami.py
youtube_dl/extractor/yandexmusic.py

index 06cffcb710c6fd8fa6962007bd07d4753d5d5af6..c9ce216d1ad0b048d464238e56e0af2183c9ad46 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
 
 clean:
-       rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+       rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
        find . -name "*.pyc" -delete
        find . -name "*.class" -delete
 
index ecf7370477f93438ffe9511f878d073e9c2e7c61..50acb26a0175df2f3f11d6c8e42c1505cb14faa6 100644 (file)
--- a/README.md
+++ b/README.md
@@ -465,7 +465,7 @@ The basic usage is not to set any template arguments when downloading a single f
  - `display_id`: An alternative identifier for the video
  - `uploader`: Full name of the video uploader
  - `license`: License name the video is licensed under
- - `creator`: The main artist who created the video
+ - `creator`: The creator of the video
  - `release_date`: The date (YYYYMMDD) when the video was released
  - `timestamp`: UNIX timestamp of the moment the video became available
  - `upload_date`: Video upload date (YYYYMMDD)
index 0554333629b829a2a6cb807546a643713cbd0ad5..2187dcc8f6b850707e5d024fd94427cdc21fe536 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -580,7 +580,7 @@ class YoutubeDL(object):
                 is_id=(k == 'id'))
             template_dict = dict((k, sanitize(k, v))
                                  for k, v in template_dict.items()
-                                 if v is not None)
+                                 if v is not None and not isinstance(v, (list, tuple, dict)))
             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 
             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
@@ -1639,7 +1639,7 @@ class YoutubeDL(object):
                     # Just a single file
                     success = dl(filename, info_dict)
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self.report_error('unable to download video data: %s' % str(err))
+                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                 return
             except (OSError, IOError) as err:
                 raise UnavailableVideoError(err)
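
Why the first YoutubeDL.py hunk filters container values out of template_dict (a standalone illustration, not library code; the 'tags' field is hypothetical):

    template = '%(title)s-%(id)s.%(ext)s'
    fields = {'title': 'clip', 'id': '42', 'ext': 'mp4', 'tags': ['a', 'b']}

    # Without the isinstance() guard, a stray '%(tags)s' placeholder would
    # expand to "['a', 'b']" in the filename; with it, container values are
    # dropped and the defaultdict in the real code substitutes 'NA' instead.
    safe = dict((k, v) for k, v in fields.items()
                if v is not None and not isinstance(v, (list, tuple, dict)))
    print(template % safe)  # clip-42.mp4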
index 24df8fe9305e7df0487965ed03756305feca3dea..42c21bf41d975bcb49fd6c398b19ed2897cd3bd2 100644 (file)
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -12,7 +12,7 @@ from ..utils import (
 
 class AolIE(InfoExtractor):
     IE_NAME = 'on.aol.com'
-    _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/.*-)(?P<id>[^/?-]+)'
+    _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)'
 
     _TESTS = [{
         # video with 5min ID
@@ -53,6 +53,12 @@ class AolIE(InfoExtractor):
     }, {
         'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763',
         'only_matching': True,
+    }, {
+        'url': 'http://on.aol.com/video/519442220',
+        'only_matching': True,
+    }, {
+        'url': 'aol-video:5707d6b8e4b090497b04f706',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 61a5d124cd2a62d1abd2fc627507496a95794290..0843d89af71f7b68f6b650c01a3f8edcffdc78b3 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -163,7 +163,7 @@ class InfoExtractor(object):
     description:    Full video description.
     uploader:       Full name of the video uploader.
     license:        License name the video is licensed under.
-    creator:        The main artist who created the video.
+    creator:        The creator of the video.
     release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
new file mode 100644 (file)
index 0000000..b60a1d8
--- /dev/null
+++ b/youtube_dl/extractor/dailymail.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+        'md5': '2f639d446394f53f3a33658b518b6615',
+        'info_dict': {
+            'id': '1288527',
+            'ext': 'mp4',
+            'title': 'Turn any video into an impressionist masterpiece',
+            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = video_data['title']
+        video_sources = self._download_json(video_data.get(
+            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('descr'),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
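
A hedged example of how one video-sources rendition maps to a format dict in the new DailyMail extractor (field names follow the code above; the values are made up):

    rendition = {
        'url': 'http://video.dailymail.co.uk/video/1288527/720.mp4',
        'encodingRate': 2048000,   # bits/s; int_or_none(..., 1000) yields tbr 2048
        'videoContainer': 'MP4',   # 'M2TS' would mark the rendition as HLS
        'frameWidth': 1280,
        'frameHeight': 720,
        'videoCodec': 'H264',
    }
    # For this entry the loop above would emit format_id 'http-2048' with
    # protocol 'http' (from determine_protocol) and leave 'ext' unset.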
index ef4431364306c4ecbc1719778f073c2e97d00d2e..aac85066fd6de4839f0fa28cbc498c63decfe00f 100644 (file)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -157,6 +157,7 @@ from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
 from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
index f1f150ef2ce41defbcee841d86fe4f9ada34d25d..8d1010b88c83dcbfd3e71e9f20275bf6fb9c9d21 100644 (file)
--- a/youtube_dl/extractor/fczenit.py
+++ b/youtube_dl/extractor/fczenit.py
@@ -1,20 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 
 
 class FczenitIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://fc-zenit.ru/video/gl6785/',
-        'md5': '458bacc24549173fe5a5aa29174a5606',
+        'url': 'http://fc-zenit.ru/video/41044/',
+        'md5': '0e3fab421b455e970fa1aa3891e57df0',
         'info_dict': {
-            'id': '6785',
+            'id': '41044',
             'ext': 'mp4',
-            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
         },
     }
 
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+        video_title = self._html_search_regex(
+            r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+        video_items = self._parse_json(self._search_regex(
+            r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+            video_id)
 
-        bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
-        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+        def merge_dicts(*dicts):
+            ret = {}
+            for a_dict in dicts:
+                ret.update(a_dict)
+            return ret
 
         formats = [{
-            'url': furl,
-            'tbr': tbr,
-        } for furl, tbr in bitrates]
+            'url': compat_urlparse.urljoin(url, video_url),
+            'tbr': int(tbr),
+        } for tbr, video_url in merge_dicts(*video_items).items()]
 
         self._sort_formats(formats)
 
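A sketch of what merge_dicts(*video_items) produces from the parsed arrPath payload (the item shape is inferred from the code; the URLs are hypothetical):

    video_items = [{'350': '/video/41044/350.mp4'}, {'800': '/video/41044/800.mp4'}]

    merged = {}
    for a_dict in video_items:
        merged.update(a_dict)
    print(merged)  # {'350': '/video/41044/350.mp4', '800': '/video/41044/800.mp4'}
    # Each (tbr, relative URL) pair then becomes one format, with
    # compat_urlparse.urljoin() resolving the URL against the page URL.
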
index 3740869c74cdc9ae878f5012c398a9bde7e7356b..11b31a69943e3252597a433b69e9ddeb471527fa 100644 (file)
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -283,6 +283,8 @@ class KuwoCategoryIE(InfoExtractor):
         category_desc = remove_start(
             get_element_by_id('intro', webpage).strip(),
             '%s简介:' % category_name)
+        if category_desc == '暂无':  # '暂无' means "none (yet)"
+            category_desc = None
 
         jsonm = self._parse_json(self._html_search_regex(
             r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
index 7ba41ba593295cdc7d2e28e6b64702321ed1ef08..721fc3a9e2d2b3431051ea00982f72ae1d98ff65 100644 (file)
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -1,7 +1,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    unified_strdate,
+)
 
 
 class RedTubeIE(InfoExtractor):
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
             'id': '66418',
             'ext': 'mp4',
             'title': 'Sucked on a toilet',
+            'upload_date': '20120831',
+            'duration': 596,
+            'view_count': int,
             'age_limit': 18,
         }
     }
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
         if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
             raise ExtractorError('Video %s has been removed' % video_id, expected=True)
 
-        video_url = self._html_search_regex(
-            r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
-        video_title = self._html_search_regex(
-            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
-            webpage, 'title')
-        video_thumbnail = self._og_search_thumbnail(webpage)
+        title = self._html_search_regex(
+            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+             r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='title')
+
+        formats = []
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+            video_id, fatal=False)
+        if sources and isinstance(sources, dict):
+            for format_id, format_url in sources.items():
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'height': int_or_none(format_id),
+                    })
+        else:
+            video_url = self._html_search_regex(
+                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+            formats.append({'url': video_url})
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+            webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+            webpage, 'view count', fatal=False))
 
         # No self-labeling, but they describe themselves as
         # "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'url': video_url,
             'ext': 'mp4',
-            'title': video_title,
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
             'age_limit': age_limit,
+            'formats': formats,
         }
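
A hedged example of the sources object the new RedTube code path expects (shape inferred from the regex and the items() loop; URLs are made up):

    sources = {
        '480': 'http://cdn.example.com/66418-480.mp4',
        '720': 'http://cdn.example.com/66418-720.mp4',
    }
    formats = [{'url': u, 'format_id': fid, 'height': int(fid)}
               for fid, u in sources.items() if u]
    # -> two formats with heights 480 and 720; when no sources blob is
    # present, the extractor falls back to the single <source src=...> URL.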
index d1e6f2703e022dac0edc3ef0f16794a6285d2b8f..13e0cd2374f9195f6de1675966ace21d0998b475 100644 (file)
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor):
         if enroll_url:
             webpage = self._download_webpage(
                 combine_url(base_url, enroll_url),
-                course_id, 'Enrolling in the course')
+                course_id, 'Enrolling in the course',
+                headers={'Referer': base_url})
             if '>You have enrolled in' in webpage:
                 self.to_screen('%s: Successfully enrolled in the course' % course_id)
 
     def _download_lecture(self, course_id, lecture_id):
         return self._download_json(
-            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
-                course_id, lecture_id, compat_urllib_parse_urlencode({
-                    'fields[lecture]': 'title,description,view_html,asset',
-                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
-                })),
-            lecture_id, 'Downloading lecture JSON')
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+            % (course_id, lecture_id),
+            lecture_id, 'Downloading lecture JSON', query={
+                'fields[lecture]': 'title,description,view_html,asset',
+                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+            })
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -155,13 +155,13 @@ class UdemyIE(InfoExtractor):
             'password': password,
         })
 
-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form))
-        request.add_header('Referer', self._ORIGIN_URL)
-        request.add_header('Origin', self._ORIGIN_URL)
-
         response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+            self._LOGIN_URL, None, 'Logging in as %s' % username,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._ORIGIN_URL,
+                'Origin': self._ORIGIN_URL,
+            })
 
         if not is_logged(response):
             error = self._html_search_regex(
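
The Udemy hunks replace hand-built sanitized_Request objects with the query=/data=/headers= keyword arguments of the download helpers. A standalone sketch of what query= does to the URL (stdlib only; the parameters are copied from the hunk, the numeric ids are placeholders):

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2

    params = {
        'fields[lecture]': 'title,description,view_html,asset',
        'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
    }
    base = 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s' % (123, 456)
    print(base + '?' + urlencode(params))  # same URL the old string formatting built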
index 63eab414850ad0d6a681a3ae73e68875632a7a12..c0ef08c02b03457d44219edc8c3434e5d07666ee 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -5,6 +5,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_etree_fromstring,
+    compat_str,
     compat_urlparse,
 )
 from ..utils import (
@@ -116,6 +117,10 @@ class VevoIE(VevoBaseIE):
             'genre': 'Pop',
         },
         'expected_warnings': ['Failed to download video versions info'],
+    }, {
+        # no genres available
+        'url': 'http://www.vevo.com/watch/INS171400764',
+        'only_matching': True,
     }]
     _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com'
     _SOURCE_TYPES = {
@@ -184,8 +189,8 @@ class VevoIE(VevoBaseIE):
             errnote='Unable to retrieve oauth token')
 
         if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage:
-            raise ExtractorError(
-                '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True)
+            self.raise_geo_restricted(
+                '%s said: This page is currently unavailable in your region' % self.IE_NAME)
 
         auth_info = self._parse_json(webpage, video_id)
         self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
@@ -200,12 +205,10 @@ class VevoIE(VevoBaseIE):
         response = self._download_json(
             json_url, video_id, 'Downloading video info', 'Unable to download info')
         video_info = response.get('video') or {}
-        video_versions = video_info.get('videoVersions')
         artist = None
         featured_artist = None
         uploader = None
         view_count = None
-        timestamp = None
         formats = []
 
         if not video_info:
@@ -339,7 +342,11 @@ class VevoIE(VevoBaseIE):
         if featured_artist:
             artist = '%s ft. %s' % (artist, featured_artist)
         title = '%s - %s' % (artist, track) if artist else track
-        genre = video_info.get('genres', [None])[0]
+
+        genres = video_info.get('genres')
+        genre = (
+            genres[0] if genres and isinstance(genres, list) and
+            isinstance(genres[0], compat_str) else None)
 
         is_explicit = video_info.get('isExplicit')
         if is_explicit is True:
index 2d1504eaacd36eb564da06eb541d2b6b8eabaa44..7690037350bbd8ad4187ea352194ba8008b2fd36 100644 (file)
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -13,12 +13,21 @@ from ..utils import (
 
 
 class XFileShareIE(InfoExtractor):
-    IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
-    _VALID_URL = r'''(?x)
-        https?://(?P<host>(?:www\.)?
-            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/
-        (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
-    '''
+    _SITES = (
+        ('daclips.in', 'DaClips'),
+        ('filehoot.com', 'FileHoot'),
+        ('gorillavid.in', 'GorillaVid'),
+        ('movpod.in', 'MovPod'),
+        ('powerwatch.pw', 'PowerWatch'),
+        ('rapidvideo.ws', 'Rapidvideo.ws'),
+        ('thevideobee.to', 'TheVideoBee'),
+        ('vidto.me', 'Vidto'),
+        ('streamin.to', 'Streamin.To'),
+    )
+
+    IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
+    _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+                  % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
 
     _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
 
@@ -43,25 +52,6 @@ class XFileShareIE(InfoExtractor):
             'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc',
             'thumbnail': 're:http://.*\.jpg',
         }
-    }, {
-        # video with countdown timeout
-        'url': 'http://fastvideo.in/1qmdn1lmsmbw',
-        'md5': '8b87ec3f6564a3108a0e8e66594842ba',
-        'info_dict': {
-            'id': '1qmdn1lmsmbw',
-            'ext': 'mp4',
-            'title': 'Man of Steel - Trailer',
-            'thumbnail': 're:http://.*\.jpg',
-        },
-    }, {
-        'url': 'http://realvid.net/ctn2y6p2eviw',
-        'md5': 'b2166d2cf192efd6b6d764c18fd3710e',
-        'info_dict': {
-            'id': 'ctn2y6p2eviw',
-            'ext': 'flv',
-            'title': 'rdx 1955',
-            'thumbnail': 're:http://.*\.jpg',
-        },
     }, {
         'url': 'http://movpod.in/0wguyyxi1yca',
         'only_matching': True,
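
How the _SITES table drives both IE_DESC and _VALID_URL (a runnable excerpt of the zip(*...) trick above, trimmed to two sites):

    import re

    _SITES = (('daclips.in', 'DaClips'), ('vidto.me', 'Vidto'))

    hosts, names = list(zip(*_SITES))[0], list(zip(*_SITES))[1]
    IE_DESC = 'XFileShare based sites: %s' % ', '.join(names)
    _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
                  % '|'.join(re.escape(site) for site in hosts))

    print(re.match(_VALID_URL, 'http://vidto.me/embed-abc123').group('id'))  # abc123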
index e4ed306b40bbcd8e8e398264cefc1bbd989f79b6..a6dfc4af98e6baafbe254579ebe060c208d6cd06 100644 (file)
--- a/youtube_dl/extractor/xiami.py
+++ b/youtube_dl/extractor/xiami.py
@@ -9,6 +9,12 @@ from ..utils import int_or_none
 class XiamiBaseIE(InfoExtractor):
     _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
 
+    def _download_webpage(self, *args, **kwargs):
+        webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs)
+        if '>Xiami is currently not available in your country.<' in webpage:
+            self.raise_geo_restricted('Xiami is currently not available in your country')
+        return webpage
+
     def _extract_track(self, track, track_id=None):
         title = track['title']
         track_url = self._decrypt(track['location'])
@@ -81,7 +86,8 @@ class XiamiSongIE(XiamiBaseIE):
                     'ext': 'lrc',
                 }],
             },
-        }
+        },
+        'skip': 'Georestricted',
     }, {
         'url': 'http://www.xiami.com/song/1775256504',
         'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
@@ -100,7 +106,8 @@ class XiamiSongIE(XiamiBaseIE):
                     'ext': 'lrc',
                 }],
             },
-        }
+        },
+        'skip': 'Georestricted',
     }]
 
     def _real_extract(self, url):
@@ -124,6 +131,7 @@ class XiamiAlbumIE(XiamiPlaylistBaseIE):
             'id': '2100300444',
         },
         'playlist_count': 10,
+        'skip': 'Georestricted',
     }, {
         'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
         'only_matching': True,
@@ -141,6 +149,7 @@ class XiamiArtistIE(XiamiPlaylistBaseIE):
             'id': '2132',
         },
         'playlist_count': 20,
+        'skip': 'Georestricted',
     }
 
 
@@ -155,4 +164,5 @@ class XiamiCollectionIE(XiamiPlaylistBaseIE):
             'id': '156527391',
         },
         'playlist_mincount': 29,
+        'skip': 'Georestricted',
     }
index ce3723b55032915a216e23f4daa902a42ca314cf..0f78466e69a709f2726d9383ffde522039f17dc3 100644 (file)
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -10,8 +10,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
-    sanitized_Request,
-    urlencode_postdata,
 )
 
 
@@ -177,7 +175,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
 class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:playlist'
     IE_DESC = 'Яндекс.Музыка - Плейлист'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
@@ -196,47 +194,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
             'id': '1036',
             'title': 'Музыка 90-х',
         },
-        'playlist_count': 310,
+        'playlist_mincount': 300,
         'skip': 'Travis CI servers blocked by YandexMusic',
     }]
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        mu = self._parse_json(
-            self._search_regex(
-                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
-            playlist_id)
-
-        playlist = mu['pageData']['playlist']
-        tracks, track_ids = playlist['tracks'], playlist['trackIds']
-
-        # tracks dictionary shipped with webpage is limited to 150 tracks,
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        user = mobj.group('user')
+        playlist_id = mobj.group('id')
+
+        playlist = self._download_json(
+            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+            playlist_id, 'Downloading playlist JSON',
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query={
+                'owner': user,
+                'kinds': playlist_id,
+                'light': 'true',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })['playlist']
+
+        tracks, track_ids = playlist['tracks'], [compat_str(t) for t in playlist['trackIds']]
+
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
         # missing tracks should be retrieved manually.
         if len(tracks) < len(track_ids):
-            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
-            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
-            request = sanitized_Request(
-                'https://music.yandex.ru/handlers/track-entries.jsx',
-                urlencode_postdata({
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
+            missing_tracks = self._download_json(
+                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+                playlist_id, 'Downloading missing tracks JSON',
+                fatal=False,
+                headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                },
+                query={
                     'entries': ','.join(missing_track_ids),
-                    'lang': mu.get('settings', {}).get('lang', 'en'),
-                    'external-domain': 'music.yandex.ru',
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
                     'overembed': 'false',
-                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),
                     'strict': 'true',
-                }))
-            request.add_header('Referer', url)
-            request.add_header('X-Requested-With', 'XMLHttpRequest')
-
-            missing_tracks = self._download_json(
-                request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+                })
             if missing_tracks:
                 tracks.extend(missing_tracks)
 
         return self.playlist_result(
             self._build_playlist(tracks),
             compat_str(playlist_id),
-            playlist['title'], playlist.get('description'))
+            playlist.get('title'), playlist.get('description'))
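
A sketch of the >150-track handling above: playlist.jsx ships at most 150 track objects, so the remaining ids are fetched separately while preserving playlist order (the ids here are made up):

    track_ids = ['1', '2', '3', '4']  # playlist['trackIds'], stringified
    tracks = [{'id': 1}, {'id': 3}]   # truncated payload from playlist.jsx

    present_track_ids = set(str(t['id']) for t in tracks if t.get('id'))
    missing_track_ids = [tid for tid in track_ids if tid not in present_track_ids]
    print(missing_track_ids)  # ['2', '4'] -> joined into the 'entries' parameter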