[xiami] Add xiami extractor
authorBlahGeek <i@BlahGeek.com>
Sat, 30 Apr 2016 13:32:54 +0000 (21:32 +0800)
committerSergey M․ <dstftw@gmail.com>
Sat, 30 Apr 2016 15:48:40 +0000 (21:48 +0600)
youtube_dl/extractor/extractors.py
youtube_dl/extractor/xiami.py [new file with mode: 0644]

index b1b7f9b42894d3d6487fea9d03508282358ab4db..14ca9eaeeefe4ecee8234829e06851db958ce53c 100644 (file)
@@ -941,6 +941,12 @@ from .xhamster import (
     XHamsterIE,
     XHamsterEmbedIE,
 )
+from .xiami import (
+    XiamiIE,
+    XiamiAlbumIE,
+    XiamiArtistIE,
+    XiamiCollectionIE
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
new file mode 100644 (file)
index 0000000..a28d63c
--- /dev/null
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_element,
+    xpath_text,
+    xpath_with_ns,
+    int_or_none,
+    ExtractorError
+)
+from ..compat import compat_urllib_parse_unquote
+
+
+class XiamiBaseIE(InfoExtractor):
+
+    _XML_BASE_URL = 'http://www.xiami.com/song/playlist/id'
+    _NS_MAP = {'xm': 'http://xspf.org/ns/0/'}
+
+    def _extract_track(self, track):
+        artist = xpath_text(track, xpath_with_ns('xm:artist', self._NS_MAP), default='')
+        artist = artist.split(';')
+
+        ret = {
+            'id': xpath_text(track, xpath_with_ns('xm:song_id', self._NS_MAP)),
+            'title': xpath_text(track, xpath_with_ns('xm:title', self._NS_MAP)),
+            'album': xpath_text(track, xpath_with_ns('xm:album_name', self._NS_MAP)),
+            'artist': ';'.join(artist) if artist else None,
+            'creator': artist[0] if artist else None,
+            'url': self._decrypt(xpath_text(track, xpath_with_ns('xm:location', self._NS_MAP))),
+            'thumbnail': xpath_text(track, xpath_with_ns('xm:pic', self._NS_MAP), default=None),
+            'duration': int_or_none(xpath_text(track, xpath_with_ns('xm:length', self._NS_MAP))),
+        }
+
+        lyrics_url = xpath_text(track, xpath_with_ns('xm:lyric', self._NS_MAP))
+        if lyrics_url and lyrics_url.endswith('.lrc'):
+            ret['description'] = self._download_webpage(lyrics_url, ret['id'])
+        return ret
+
+    def _extract_xml(self, _id, typ=''):
+        playlist = self._download_xml('%s/%s%s' % (self._XML_BASE_URL, _id, typ), _id)
+        tracklist = xpath_element(playlist, xpath_with_ns('./xm:trackList', self._NS_MAP))
+
+        if not len(tracklist):
+            raise ExtractorError('No track found')
+        return [self._extract_track(track) for track in tracklist]
+
+    @staticmethod
+    def _decrypt(origin):
+        n = int(origin[0])
+        origin = origin[1:]
+        short_lenth = len(origin) // n
+        long_num = len(origin) - short_lenth * n
+        l = tuple()
+        for i in range(0, n):
+            length = short_lenth
+            if i < long_num:
+                length += 1
+            l += (origin[0:length], )
+            origin = origin[length:]
+        ans = ''
+        for i in range(0, short_lenth + 1):
+            for j in range(0, n):
+                if len(l[j])>i:
+                    ans += l[j][i]
+        return compat_urllib_parse_unquote(ans).replace('^', '0')
+
+
+class XiamiIE(XiamiBaseIE):
+    IE_NAME = 'xiami:song'
+    IE_DESC = '虾米音乐'
+    _VALID_URL = r'http://www\.xiami\.com/song/(?P<id>[0-9]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.xiami.com/song/1775610518',
+            'md5': '521dd6bea40fd5c9c69f913c232cb57e',
+            'info_dict': {
+                'id': '1775610518',
+                'ext': 'mp3',
+                'title': 'Woman',
+                'creator': 'HONNE',
+                'album': 'Woman',
+                'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+                'description': 'md5:052ec7de41ca19f67e7fd70a1bfc4e0b',
+            }
+        },
+        {
+            'url': 'http://www.xiami.com/song/1775256504',
+            'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
+            'info_dict': {
+                'id': '1775256504',
+                'ext': 'mp3',
+                'title': '悟空',
+                'creator': '戴荃',
+                'album': '悟空',
+                'description': 'md5:206e67e84f9bed1d473d04196a00b990',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        _id = self._match_id(url)
+        return self._extract_xml(_id)[0]
+
+
+class XiamiAlbumIE(XiamiBaseIE):
+    IE_NAME = 'xiami:album'
+    IE_DESC = '虾米音乐 - 专辑'
+    _VALID_URL = r'http://www\.xiami\.com/album/(?P<id>[0-9]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.xiami.com/album/2100300444',
+            'info_dict': {
+                'id': '2100300444',
+            },
+            'playlist_count': 10,
+        },
+        {
+            'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        _id = self._match_id(url)
+        return self.playlist_result(self._extract_xml(_id, '/type/1'), _id)
+
+
+class XiamiArtistIE(XiamiBaseIE):
+    IE_NAME = 'xiami:artist'
+    IE_DESC = '虾米音乐 - 歌手'
+    _VALID_URL = r'http://www\.xiami\.com/artist/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
+        'info_dict': {
+            'id': '2132',
+        },
+        'playlist_count': 20,
+    }
+
+    def _real_extract(self, url):
+        _id = self._match_id(url)
+        return self.playlist_result(self._extract_xml(_id, '/type/2'), _id)
+
+
+class XiamiCollectionIE(XiamiBaseIE):
+    IE_NAME = 'xiami:collection'
+    IE_DESC = '虾米音乐 - 精选集'
+    _VALID_URL = r'http://www\.xiami\.com/collect/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
+        'info_dict': {
+            'id': '156527391',
+        },
+        'playlist_count': 26,
+    }
+
+    def _real_extract(self, url):
+        _id = self._match_id(url)
+        return self.playlist_result(self._extract_xml(_id, '/type/3'), _id)