_ Git - youtube-dl/blob - youtube_dl/extractor/wrzuta.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     int_or_none,
   9     qualities,
  10 )
  11
  12
  13 class WrzutaIE(InfoExtractor):
  14     IE_NAME = 'wrzuta.pl'
  15
  16     _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/(?P<typ>film|audio)/(?P<id>[0-9a-zA-Z]+)'
  17
  18     _TESTS = [{
  19         'url': 'http://laboratoriumdextera.wrzuta.pl/film/aq4hIZWrkBu/nike_football_the_last_game',
  20         'md5': '9e67e05bed7c03b82488d87233a9efe7',
  21         'info_dict': {
  22             'id': 'aq4hIZWrkBu',
  23             'ext': 'mp4',
  24             'title': 'Nike Football: The Last Game',
  25             'duration': 307,
  26             'uploader_id': 'laboratoriumdextera',
  27             'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
  28         },
  29     }, {
  30         'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
  31         'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
  32         'info_dict': {
  33             'id': '063jOPX5ue2',
  34             'ext': 'ogg',
  35             'title': 'Liber & Natalia Szroeder - Teraz Ty',
  36             'duration': 203,
  37             'uploader_id': 'jolka85',
  38             'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
  39         },
  40     }]
  41
  42     def _real_extract(self, url):
  43         mobj = re.match(self._VALID_URL, url)
  44         video_id = mobj.group('id')
  45         typ = mobj.group('typ')
  46         uploader = mobj.group('uploader')
  47
  48         webpage = self._download_webpage(url, video_id)
  49
  50         quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
  51
  52         audio_table = {'flv': 'mp3', 'webm': 'ogg', '???': 'mp3'}
  53
  54         embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id)
  55
  56         formats = []
  57         for media in embedpage['url']:
  58             fmt = media['type'].split('@')[0]
  59             if typ == 'audio':
  60                 ext = audio_table.get(fmt, fmt)
  61             else:
  62                 ext = fmt
  63
  64             formats.append({
  65                 'format_id': '%s_%s' % (ext, media['quality'].lower()),
  66                 'url': media['url'],
  67                 'ext': ext,
  68                 'quality': quality(media['quality']),
  69             })
  70
  71         self._sort_formats(formats)
  72
  73         return {
  74             'id': video_id,
  75             'title': self._og_search_title(webpage),
  76             'thumbnail': self._og_search_thumbnail(webpage),
  77             'formats': formats,
  78             'duration': int_or_none(embedpage['duration']),
  79             'uploader_id': uploader,
  80             'description': self._og_search_description(webpage),
  81             'age_limit': embedpage.get('minimalAge', 0),
  82         }
  83
  84
  85 _ENTRY_PATTERN = r'<a href="(?P<playlist_entry_url>[^"]+)" target="_blank" class="playlist\-file\-page">'
  86 _PLAYLIST_SIZE_PATTERN = r'<div class="playlist-counter">[0-9]+/([0-9]+)</div>'
  87
  88
  89 class WrzutaPlaylistIE(InfoExtractor):
  90     """
  91         this class covers extraction of wrzuta playlist entries
  92         the extraction process bases on following steps:
  93         * collect information of playlist size
  94         * download all entries provided on
  95           the playlist webpage (the playlist is split
  96           on two pages: first directly reached from webpage
  97           second: downloaded on demand by ajax call and rendered
  98           using the ajax call response)
  99         * in case size of extracted entries not reached total number of entries
 100           use the ajax call to collect the remaining entries
 101     """
 102
 103     IE_NAME = 'wrzuta.pl:playlist'
 104
 105     _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/playlista/' \
 106                  '(?P<id>[0-9a-zA-Z]+)/.*'
 107
 108     _TESTS = [{
 109         'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR/moja_muza',
 110         'playlist_mincount': 14,
 111         'info_dict': {
 112             'id': '7XfO4vE84iR',
 113             'title': 'Moja muza',
 114         },
 115     }, {
 116         'url': 'http://heroesf70.wrzuta.pl/playlista/6Nj3wQHx756/lipiec_-_lato_2015_muzyka_swiata',
 117         'playlist_mincount': 144,
 118         'info_dict': {
 119             'id': '6Nj3wQHx756',
 120             'title': 'Lipiec - Lato 2015 Muzyka Świata',
 121         },
 122     }]
 123
 124     def _real_extract(self, url):
 125         mobj = re.match(self._VALID_URL, url)
 126         playlist_id = mobj.group('id')
 127         uploader = mobj.group('uploader')
 128
 129         entries = []
 130
 131         webpage = self._download_webpage(url, playlist_id)
 132
 133         playlist_size = self._html_search_regex(_PLAYLIST_SIZE_PATTERN, webpage, 'Size of the playlist')
 134         playlist_size = int(playlist_size) if playlist_size else 0
 135
 136         playlist_title = self._og_search_title(webpage).replace('Playlista: ', '', 1)
 137
 138         if playlist_size:
 139             entries = list(map(
 140                 lambda entry_url: self.url_result(entry_url),
 141                 re.findall(_ENTRY_PATTERN, webpage)
 142             ))
 143
 144             if playlist_size > len(entries):
 145                 playlist_content = self._download_json(
 146                     'http://{uploader_id}.wrzuta.pl/xhr/get_playlist_offset/{playlist_id}'.format(
 147                         uploader_id=uploader,
 148                         playlist_id=playlist_id,
 149                     ),
 150                     playlist_id,
 151                     'Downloading playlist content as JSON metadata',
 152                     'Unable to download playlist content as JSON metadata',
 153                 )
 154                 entries += [self.url_result(entry['filelink']) for entry in playlist_content['files']]
 155
 156         return self.playlist_result(entries, playlist_id, playlist_title)