From: Sergey M. Date: Wed, 22 Jul 2015 17:42:19 +0000 (+0600) Subject: Merge pull request #6292 from atomicdryad/pr-fix_pbs_titles X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=9872e588c855e9c8cf456ddc4514e6e127f56322;hp=0eacd2aaae7d4150d9cf4e1dd8ffc2ebc7ab030b;p=youtube-dl Merge pull request #6292 from atomicdryad/pr-fix_pbs_titles pbs: fix vague 'Full Episode' titles; prepend name of show --- diff --git a/AUTHORS b/AUTHORS index 4fd65f46f..373e05c9f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -133,3 +133,4 @@ Remita Amine Aurélio A. Heckert Bernhard Minks sceext +Zach Bruggeman diff --git a/README.md b/README.md index a2cc89cdb..ac54d7b67 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ which means you can modify it, redistribute it or use it however you like. ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" + --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. --match-title REGEX Download only matching titles (regex or caseless sub-string) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a84878026..73445137f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,6 +28,7 @@ - **anitube.se** - **AnySex** - **Aparat** + - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 - **AppleTrailers** - **archive.org**: archive.org videos @@ -486,6 +487,7 @@ - **SportBox** - **SportBoxEmbed** - **SportDeutschland** + - **Sportschau** - **Srf** - **SRMediathek**: Saarländischer Rundfunk - **SSA** @@ -611,8 +613,8 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** - - **vk.com** - - **vk.com:user-videos**: vk.com:All of a user's videos + - **vk**: VK + - **vk:uservideos**: VK - User's Videos - **Vodlocker** - **VoiceRepublic** - **Vporn** diff --git a/test/test_utils.py b/test/test_utils.py index e13e11b59..65692a9fb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -324,6 +324,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('02:03:04'), 7384) self.assertEqual(parse_duration('01:02:03:04'), 93784) self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) + self.assertEqual(parse_duration('87 Min.'), 5220) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 00af78e06..702a6ad50 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1104,7 +1104,8 @@ class YoutubeDL(object): if req_format is None: req_format_list = [] if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and - info_dict['extractor'] in ['youtube', 'ted']): + info_dict['extractor'] in ['youtube', 'ted'] and + not info_dict.get('is_live')): merger = FFmpegMergerPP(self) if merger.available and merger.can_merge(): req_format_list.append('bestvideo+bestaudio') diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index f110830c4..dccc59212 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -8,6 +8,7 @@ from .hls import NativeHlsFD from .http import HttpFD from .rtsp import RtspFD from .rtmp import RtmpFD +from .dash import DashSegmentsFD from ..utils import ( determine_protocol, @@ -20,6 +21,7 @@ PROTOCOL_MAP = { 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, } diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 000000000..a4685d307 --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import FileDownloader +from ..compat import compat_urllib_request + + +class DashSegmentsFD(FileDownloader): + """ + Download segments in a DASH manifest + """ + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + base_url = info_dict['url'] + segment_urls = info_dict['segment_urls'] + + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None + byte_counter = 0 + + def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): + self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) + req = compat_urllib_request.Request(target_url) + if remaining_bytes is not None: + req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + + data = self.ydl.urlopen(req).read() + + if remaining_bytes is not None: + data = data[:remaining_bytes] + + outf.write(data) + return len(data) + + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s/%s' % (base_url, target_url) + + with open(tmpfilename, 'wb') as outf: + append_url_to_file( + outf, combine_url(base_url, info_dict['initialization_url']), + 'initialization segment') + for i, segment_url in enumerate(segment_urls): + segment_len = append_url_to_file( + outf, combine_url(base_url, segment_url), + 'segment %d / %d' % (i + 1, len(segment_urls)), + remaining_bytes) + byte_counter += segment_len + if remaining_bytes is not None: + remaining_bytes -= segment_len + if remaining_bytes <= 0: + break + + self.try_rename(tmpfilename, filename) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5033d67ed..3cfa804ec 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -22,7 +22,11 @@ from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE -from .ard import ARDIE, ARDMediathekIE +from .ard import ( + ARDIE, + ARDMediathekIE, + SportschauIE, +) from .arte import ( ArteTvIE, ArteTVPlus7IE, @@ -271,6 +275,7 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lecture2go import Lecture2GoIE from .letv import ( LetvIE, LetvTvIE, @@ -553,7 +558,6 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE -from .sportschau import SportschauIE from .srf import SrfIE from .srmediathek import SRMediathekIE from .ssa import SSAIE diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6a35ea463..6f465789b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,6 +8,7 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, + get_element_by_attribute, qualities, int_or_none, parse_duration, @@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, + 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', + 'info_dict': { + 'id': '29582122', + 'ext': 'mp4', + 'title': 'Ich liebe das Leben trotzdem', + 'description': 'md5:45e4c225c72b27993314b31a84a5261c', + 'duration': 4557, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', + 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', 'info_dict': { - 'id': '22490580', + 'id': '29522730', 'ext': 'mp4', - 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', - 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)', + 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', + 'duration': 5252, }, - 'skip': 'Blocked outside of Germany', + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', + 'info_dict': { + 'id': '28488308', + 'ext': 'mp3', + 'title': 'Tod eines Fußballers', + 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', + 'duration': 3240, + }, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, }] + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + + formats = self._extract_formats(media_info, video_id) + + if not formats: + if '"fsk"' in webpage: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + raise ExtractorError('This video is not available due to geo restriction', expected=True) + + self._sort_formats(formats) + + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + ext = determine_ext(stream_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', + video_id, preference=-1, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + elif stream_url.startswith('http'): + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + else: + continue + m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -92,46 +199,22 @@ class ARDMediathekIE(InfoExtractor): 'format_id': fid, 'url': furl, }) + self._sort_formats(formats) + info = { + 'formats': formats, + } else: # request JSON file - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') - - formats = [] - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) - }) - continue - - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } - - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) + info = self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) - formats.append(format) - - self._sort_formats(formats) - - return { + info.update({ 'id': video_id, 'title': title, 'description': description, - 'formats': formats, 'thumbnail': thumbnail, - } + }) + + return info class ARDIE(InfoExtractor): @@ -189,3 +272,41 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class SportschauIE(ARDMediathekIE): + IE_NAME = 'Sportschau' + _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' + _TESTS = [{ + 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', + 'info_dict': { + 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', + 'ext': 'mp4', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + base_url = mobj.group('baseurl') + + webpage = self._download_webpage(url, video_id) + title = get_element_by_attribute('class', 'headline', webpage) + description = self._html_search_meta('description', webpage, 'description') + + info = self._extract_media_info( + base_url + '-mc_defaultQuality-h.json', webpage, video_id) + + info.update({ + 'title': title, + 'description': description, + }) + + return info diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 706ed9c99..75723c00d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -161,7 +161,8 @@ class FranceTVIE(FranceTVBaseInfoExtractor): (?: (?:www\.)?france[2345o]\.fr/ (?: - emissions/[^/]+/(?:videos|diffusions)?| + emissions/[^/]+/(?:videos|diffusions)| + emission/[^/]+| videos| jt ) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dc24a8a8b..cd133a10c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1196,6 +1196,12 @@ class GenericIE(InfoExtractor): if vimeo_url is not None: return self.url_result(vimeo_url) + vid_me_embed_url = self._search_regex( + r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', + webpage, 'vid.me embed', default=None) + if vid_me_embed_url is not None: + return self.url_result(vid_me_embed_url, 'Vidme') + # Look for embedded YouTube player matches = re.findall(r'''(?x) (?: diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py new file mode 100644 index 000000000..40a3d2346 --- /dev/null +++ b/youtube_dl/extractor/lecture2go.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_duration, + int_or_none, +) + + +class Lecture2GoIE(InfoExtractor): + _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' + _TEST = { + 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', + 'md5': 'ac02b570883020d208d405d5a3fd2f7f', + 'info_dict': { + 'id': '17473', + 'ext': 'flv', + 'title': '2 - Endliche Automaten und reguläre Sprachen', + 'creator': 'Frank Heitmann', + 'duration': 5220, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r']+class="title">(.+)', webpage, 'title') + + formats = [] + for url in set(re.findall(r'"src","([^"]+)"', webpage)): + ext = determine_ext(url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats(url, video_id)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(url, video_id)) + else: + formats.append({ + 'url': url, + }) + + self._sort_formats(formats) + + creator = self._html_search_regex( + r']+id="description">([^<]+)', webpage, 'creator', fatal=False) + duration = parse_duration(self._html_search_regex( + r'Duration:\s*\s*]*>([^<]+)', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._html_search_regex( + r'Views:\s*\s*]+>(\d+)', webpage, 'view count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'creator': creator, + 'duration': duration, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py index cf495f310..6977afb27 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/snagfilms.py @@ -23,6 +23,15 @@ class SnagFilmsEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': '#whilewewatch', } + }, { + # invalid labels, 360p is better that 480p + 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', + 'md5': '882fca19b9eb27ef865efeeaed376a48', + 'info_dict': { + 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', + 'ext': 'mp4', + 'title': 'Life in Limbo', + } }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -52,14 +61,15 @@ class SnagFilmsEmbedIE(InfoExtractor): if not file_: continue type_ = source.get('type') - format_id = source.get('label') ext = determine_ext(file_) - if any(_ == 'm3u8' for _ in (type_, ext)): + format_id = source.get('label') or ext + if all(v == 'm3u8' for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: bitrate = int_or_none(self._search_regex( - r'(\d+)kbps', file_, 'bitrate', default=None)) + [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], + file_, 'bitrate', default=None)) height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py deleted file mode 100644 index bf9b075db..000000000 --- a/youtube_dl/extractor/sportschau.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import get_element_by_attribute - - -class SportschauIE(InfoExtractor): - IE_NAME = 'Sportschau' - _VALID_URL = r'https?://(?:www\.)?sportschau\.de/\w+(?:/\w+)?/video(?P\w+)\.html' - _TEST = { - 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'info_dict': { - 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', - 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - ext = '-mc_defaultQuality-h.json' - json_url = url[:-5] + ext - - json = self._download_json(json_url, video_id) - thumb_url = json['_previewImage'] - - m3u8_url = json['_mediaArray'][1]['_mediaStreamArray'][0]['_stream'][0] - m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, ext="mp4") - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - desc = self._html_search_meta('description', webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': m3u8_formats, - 'description': desc, - 'thumbnail': thumb_url, - } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 9ead13a91..3d3b635e4 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,8 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .pornhub import PornHubIE -from .vimeo import VimeoIE class TumblrIE(InfoExtractor): @@ -60,26 +58,16 @@ class TumblrIE(InfoExtractor): blog = m_url.group('blog_name') url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) - webpage = self._download_webpage(url, video_id) - - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - - pornhub_url = PornHubIE._extract_url(webpage) - if pornhub_url: - return self.url_result(pornhub_url, 'PornHub') - - vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) - if vimeo_url: - return self.url_result(vimeo_url, 'Vimeo') + webpage, urlh = self._download_webpage_handle(url, video_id) iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', - webpage, 'iframe url') - iframe = self._download_webpage(iframe_url, video_id) + webpage, 'iframe url', default=None) + if iframe_url is None: + return self.url_result(urlh.geturl(), 'Generic') + + iframe = self._download_webpage(iframe_url, video_id, + 'Downloading iframe page') video_url = self._search_regex(r'[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/(?P[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', 'info_dict': { 'id': '12772022048', @@ -331,7 +331,10 @@ class TwitchStreamIE(TwitchBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.twitch.tv/miracle_doto#profile-0', + 'only_matching': True, + }] def _real_extract(self, url): channel_id = self._match_id(url) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index e2bab52fe..4a0eaf65f 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -77,7 +77,11 @@ class UdemyIE(InfoExtractor): login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') - if login_popup == '
': + def is_logged(webpage): + return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) + + # already logged in + if is_logged(login_popup): return login_form = self._form_hidden_inputs('login-form', login_popup) @@ -95,8 +99,7 @@ class UdemyIE(InfoExtractor): response = self._download_webpage( request, None, 'Logging in as %s' % username) - if all(logout_pattern not in response - for logout_pattern in ['href="https://www.udemy.com/user/logout/', '>Logout<']): + if not is_logged(response): error = self._html_search_regex( r'(?s)]+class="form-errors[^"]*">(.+?)', response, 'error message', default=None) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 04e2b0ba7..01af7a995 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,5 +1,4 @@ from __future__ import unicode_literals -import re from .common import InfoExtractor from .ooyala import OoyalaIE @@ -7,25 +6,29 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'http://www\.vice\.com/.*?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P.+)' - _TEST = { - 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', - 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, - }, - } + _TESTS = [ + { + 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + 'info_dict': { + 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'ext': 'mp4', + 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + }, { + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'only_matching': True, + } + ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - webpage = self._download_webpage(url, name) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) try: embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 1742e66f4..6ef36290b 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -1,129 +1,137 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_request +from ..compat import ( + compat_urllib_request, + compat_urllib_parse, +) +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, +) class ViewsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P\d+-\d+-\d+)' + _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P\d+-\d+-\d+)' _TESTS = [{ - # movielink, paymethod=fre - 'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/', - 'playlist': [{ - 'md5': '8f9d94b282d80c42b378dffdbb11caf3', - 'info_dict': { - 'id': '1293-19341-000-movie', - 'ext': 'flv', - 'title': "'Hout' (Wood) - Movie", - }, - }], - 'info_dict': { - 'id': '1293-19341-000', - 'title': "'Hout' (Wood)", - 'description': 'md5:925733185a9242ef96f436937683f33b', - } - }, { - # movielink, paymethod=adv + # movie, Type=Movie 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', - 'playlist': [{ - 'md5': '77a005453ca7396cbe3d35c9bea30aef', - 'info_dict': { - 'id': '1140-11855-000-movie', - 'ext': 'flv', - 'title': "THE LISTENING PROJECT - Movie", - }, - }], + 'md5': '14d3cfffe66d57b41ae2d9c873416f01', 'info_dict': { 'id': '1140-11855-000', - 'title': "THE LISTENING PROJECT", - 'description': 'md5:714421ae9957e112e672551094bf3b08', - } + 'ext': 'flv', + 'title': 'The listening Project', + 'description': 'md5:bac720244afd1a8ea279864e67baa071', + 'timestamp': 1214870400, + 'upload_date': '20080701', + 'duration': 4680, + }, }, { - # direct links, no movielink - 'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/', - 'playlist': [{ - 'md5': '0307b7eac6bfb21ab0577a71f6eebd8f', - 'info_dict': { - 'id': '1198-56411-000-trailer', - 'ext': 'mp4', - 'title': "Sinister - Trailer", - }, - }, { - 'md5': '80b9ee3ad69fb368f104cb5d9732ae95', - 'info_dict': { - 'id': '1198-56411-000-behind-scenes', - 'ext': 'mp4', - 'title': "Sinister - Behind Scenes", - }, - }, { - 'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5', - 'info_dict': { - 'id': '1198-56411-000-scene-from-movie', - 'ext': 'mp4', - 'title': "Sinister - Scene from movie", - }, - }], + # series episode, Type=Episode + 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', + 'md5': 'd5434c80fcfdb61651cc2199a88d6ba3', 'info_dict': { - 'id': '1198-56411-000', - 'title': "Sinister", - 'description': 'md5:014c40b0488848de9683566a42e33372', - } + 'id': '1284-19427-001', + 'ext': 'flv', + 'title': 'The World and a Wall', + 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3', + 'timestamp': 1428192000, + 'upload_date': '20150405', + 'duration': 1500, + }, + }, { + # serie, Type=Serie + 'url': 'http://www.viewster.com/serie/1303-19426-000/', + 'info_dict': { + 'id': '1303-19426-000', + 'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?', + 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11', + }, + 'playlist_count': 13, + }, { + # unfinished serie, no Type + 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/', + 'info_dict': { + 'id': '1284-19427-000', + 'title': 'Baby Steps—Season 2', + 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1', + }, + 'playlist_mincount': 16, }] _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' + _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA==' - def _real_extract(self, url): - video_id = self._match_id(url) - - request = compat_urllib_request.Request( - 'http://api.live.viewster.com/api/v1/movie/%s' % video_id) + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + request = compat_urllib_request.Request(url) request.add_header('Accept', self._ACCEPT_HEADER) + request.add_header('Auth-token', self._AUTH_TOKEN) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) - movie = self._download_json( - request, video_id, 'Downloading movie metadata JSON') - - title = movie.get('title') or movie['original_title'] - description = movie.get('synopsis') - thumbnail = movie.get('large_artwork') or movie.get('artwork') - - entries = [] - for clip in movie['play_list']: - entry = None - - # movielink api - link_request = clip.get('link_request') - if link_request: - request = compat_urllib_request.Request( - 'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s¤cy=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s' - % link_request) - request.add_header('Accept', self._ACCEPT_HEADER) + def _real_extract(self, url): + video_id = self._match_id(url) - movie_link = self._download_json( - request, video_id, 'Downloading movie link JSON', fatal=False) + info = self._download_json( + 'https://public-api.viewster.com/search/%s' % video_id, + video_id, 'Downloading entry JSON') - if movie_link: - formats = self._extract_f4m_formats( - movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id) - self._sort_formats(formats) - entry = { - 'formats': formats, - } + entry_id = info.get('Id') or info['id'] - # direct link - clip_url = clip.get('clip_data', {}).get('url') - if clip_url: - entry = { - 'url': clip_url, - 'ext': 'mp4', - } + # unfinished serie has no Type + if info.get('Type') in ['Serie', None]: + episodes = self._download_json( + 'https://public-api.viewster.com/series/%s/episodes' % entry_id, + video_id, 'Downloading series JSON') + entries = [ + self.url_result( + 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') + for episode in episodes] + title = (info.get('Title') or info['Synopsis']['Title']).strip() + description = info.get('Synopsis', {}).get('Detailed') + return self.playlist_result(entries, video_id, title, description) - if entry: - entry.update({ - 'id': '%s-%s' % (video_id, clip['canonical_title']), - 'title': '%s - %s' % (title, clip['title']), + formats = [] + for media_type in ('application/f4m+xml', 'application/x-mpegURL'): + media = self._download_json( + 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' + % (entry_id, compat_urllib_parse.quote(media_type)), + video_id, 'Downloading %s JSON' % media_type, fatal=False) + if not media: + continue + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + video_url += '&' if '?' in video_url else '?' + video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', + fatal=False # m3u8 sometimes fail + )) + else: + formats.append({ + 'url': video_url, }) - entries.append(entry) + self._sort_formats(formats) - playlist = self.playlist_result(entries, video_id, title, description) - playlist['thumbnail'] = thumbnail - return playlist + synopsis = info.get('Synopsis', {}) + # Prefer title outside synopsis since it's less messy + title = (info.get('Title') or synopsis['Title']).strip() + description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short') + duration = int_or_none(info.get('Duration')) + timestamp = parse_iso8601(info.get('ReleaseDate')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e7f5c7861..3d8b31f98 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -535,7 +535,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'dorappi2000', 'formats': 'mincount:33', }, - } + }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150501', # According to '', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -922,6 +948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1226,6 +1254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'dislike_count': dislike_count, 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 85365d769..9016e3498 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -219,7 +219,7 @@ def parseOpts(overrideArguments=None): selection.add_option( '--playlist-items', dest='playlist_items', metavar='ITEM_SPEC', default=None, - help='Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') + help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 942f76d24..ae813099d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1309,10 +1309,10 @@ def parse_duration(s): m = re.match( r'''(?ix)(?:P?T)? (?: - (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| + (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3ad7a2bc0..280afdd7f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.18' +__version__ = '2015.07.21'