X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=07bd2cbe2f9f6c4213e31e4c6e90d2df7b611f58;hb=12557339453e25dbb18dfc51dc1e88ca5325d8e9;hp=f411ea7633568915e4e97df61958d13c6a8aca80;hpb=611c1dd96efc36a788475e14cc4de64d554d28a0;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f411ea763..07bd2cbe2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,13 +15,14 @@ import math from ..compat import ( compat_cookiejar, compat_cookies, + compat_etree_fromstring, compat_getpass, compat_http_client, + compat_os_name, + compat_str, compat_urllib_error, compat_urllib_parse, compat_urlparse, - compat_str, - compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -46,6 +47,7 @@ from ..utils import ( xpath_with_ns, determine_protocol, parse_duration, + mimetype2ext, ) @@ -156,12 +158,14 @@ class InfoExtractor(object): thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. + license: License name the video is licensed under. creator: The main artist who created the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {language: subformats}. 
"subformats" is a list sorted from @@ -424,7 +428,7 @@ class InfoExtractor(object): self.to_screen('Saving request to ' + filename) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if os.name == 'nt': + if compat_os_name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = '\\\\?\\' + absfilepath @@ -593,7 +597,7 @@ class InfoExtractor(object): if mobj: break - if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name @@ -899,6 +903,16 @@ class InfoExtractor(object): item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), formats) + @staticmethod + def _remove_duplicate_formats(formats): + format_urls = set() + unique_formats = [] + for f in formats: + if f['url'] not in format_urls: + format_urls.add(f['url']) + unique_formats.append(f) + formats[:] = unique_formats + def _is_valid_url(self, url, video_id, item='video'): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid @@ -1022,11 +1036,21 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() - # A Media Playlist Tag MUST NOT appear in a Master Playlist - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 - if '#EXT-X-TARGETDURATION' in m3u8_doc: + + # We should try extracting formats only from master playlists [1], i.e. + # playlists that describe available qualities. On the other hand media + # playlists [2] should be returned as is since they contain just the media + # without qualities renditions. 
+ # Fortunately, master playlist can be easily distinguished from media + # playlist based on particular tags availability. As of [1, 2] master + # playlist tags MUST NOT appear in a media playlist and vice versa. + # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist + # and MUST NOT appear in master playlist thus we can clearly detect media + # playlist with this criterion. + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, 'format_id': m3u8_id, @@ -1073,19 +1097,29 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, } - codecs = last_info.get('CODECS') - if codecs: - # TODO: looks like video codec is not always necessarily goes first - va_codecs = codecs.split(',') - if va_codecs[0]: - f['vcodec'] = va_codecs[0] - if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + codecs = last_info.get('CODECS') + if codecs: + vcodec, acodec = [None] * 2 + va_codecs = codecs.split(',') + if len(va_codecs) == 1: + # Audio only entries usually come with single codec and + # no resolution. For more robustness we also check it to + # be mp4 audio. 
+ if not resolution and va_codecs[0].startswith('mp4a'): + vcodec, acodec = 'none', va_codecs[0] + else: + vcodec = va_codecs[0] + else: + vcodec, acodec = va_codecs[:2] + f.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1277,16 +1311,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) - if not ext: - type_ = textstream.get('type') - SUBTITLES_TYPES = { - 'text/vtt': 'vtt', - 'text/srt': 'srt', - 'application/smptett+xml': 'tt', - } - if type_ in SUBTITLES_TYPES: - ext = SUBTITLES_TYPES[type_] + ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1598,6 +1623,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def mark_watched(self, *args, **kwargs): + if (self._downloader.params.get('mark_watched', False) and + (self._get_login_info()[0] is not None or + self._downloader.params.get('cookiefile') is not None)): + self._mark_watched(*args, **kwargs) + + def _mark_watched(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + class SearchInfoExtractor(InfoExtractor): """