From: Philipp Hagemeister Date: Fri, 11 Jul 2014 08:54:53 +0000 (+0200) Subject: Merge remote-tracking branch 'pachacamac/vodlocker' X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=c09cbf0ed91ed54882abe6633b1e70e8a8b7db2d;hp=537ba6f3818004ef43e0067fd1be8dbd1bbeed46;p=youtube-dl Merge remote-tracking branch 'pachacamac/vodlocker' --- diff --git a/README.md b/README.md index 2bea609bf..dffdaa9dc 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,9 @@ which means you can modify it, redistribute it or use it however you like. --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large - apple". By default (with value "auto") - youtube-dl guesses. + apple". Use the value "auto" to let + youtube-dl guess. The default value "error" + just throws an error. --ignore-config Do not read configuration files. When given in the global configuration file /etc /youtube-dl.conf: do not read the user diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 4b56137ce..2bc81f020 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -69,9 +69,6 @@ class TestAllURLsMatching(unittest.TestCase): def test_youtube_show_matching(self): self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - def test_youtube_truncated(self): - self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) - def test_youtube_search_matching(self): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) diff --git a/test/test_playlists.py b/test/test_playlists.py index 42051fe2a..994b1d4b0 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -28,8 +28,9 @@ from youtube_dl.extractor import ( SoundcloudSetIE, SoundcloudUserIE, SoundcloudPlaylistIE, - TeacherTubeClassroomIE, + TeacherTubeUserIE, LivestreamIE, + LivestreamOriginalIE, NHLVideocenterIE, BambuserChannelIE, BandcampAlbumIE, @@ -155,6 +156,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], 'TEDCity2.0 (English)') self.assertTrue(len(result['entries']) >= 4) + def test_livestreamoriginal_folder(self): + dl = FakeYDL() + ie = LivestreamOriginalIE(dl) + result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3') + self.assertTrue(len(result['entries']) >= 28) + def test_nhl_videocenter(self): dl = FakeYDL() ie = NHLVideocenterIE(dl) @@ -370,13 +379,13 @@ class TestPlaylists(unittest.TestCase): result['title'], 'Brace Yourself - Today\'s Weirdest News') self.assertTrue(len(result['entries']) >= 10) - def test_TeacherTubeClassroom(self): + def test_TeacherTubeUser(self): dl = FakeYDL() - ie = TeacherTubeClassroomIE(dl) - result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2') + ie = TeacherTubeUserIE(dl) + result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'rbhagwati2') - self.assertTrue(len(result['entries']) >= 20) + self.assertTrue(len(result['entries']) >= 179) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 8417c55a6..8d46fe108 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -33,6 +33,12 @@ _TESTS = [ 90, u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), + ( + u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', + u'js', + u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', + u'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', + ), ] @@ -44,7 +50,7 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_length, expected_sig): +def make_tfunc(url, stype, sig_input, expected_sig): basename = url.rpartition('/')[2] m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) assert m, '%r should follow URL format' % basename @@ -66,7 +72,9 @@ def make_tfunc(url, stype, sig_length, expected_sig): with open(fn, 'rb') as testf: swfcode = testf.read() func = ie._parse_sig_swf(swfcode) - src_sig = compat_str(string.printable[:sig_length]) + src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dc0ba986a..3dff723b8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -993,6 +993,8 @@ class YoutubeDL(object): fd = get_suitable_downloader(info)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) + if self.params.get('verbose'): + self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: downloaded = [] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1e01432d2..31ed63fcc 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -59,6 +59,7 @@ __authors__ = ( 'Adam Thalhammer', 'Georg Jähnig', 'Ralf Haring', + 'Koki Takahashi', ) __license__ = 'Public Domain' @@ -269,7 +270,7 @@ def parseOpts(overrideArguments=None): general.add_option( '--default-search', dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess. The default value "error" just throws an error.') general.add_option( '--ignore-config', action='store_true', diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1666aa372..7b3f9ae24 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -3,6 +3,7 @@ from .addanime import AddAnimeIE from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE from .aol import AolIE +from .allocine import AllocineIE from .aparat import AparatIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE @@ -63,6 +64,7 @@ from .dailymotion import ( from .daum import DaumIE from .dotsub import DotsubIE from .dreisat import DreiSatIE +from .drtv import DRTVIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE @@ -147,7 +149,11 @@ from .ku6 import Ku6IE from .la7 import LA7IE from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE -from .livestream import LivestreamIE, LivestreamOriginalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) from .lynda import ( LyndaIE, LyndaCourseIE @@ -165,6 +171,7 @@ from .mpora import MporaIE from .mofosex import MofosexIE from .mooshare import MooshareIE from .morningstar import MorningstarIE +from .motherless import MotherlessIE from .motorsport import MotorsportIE from .moviezine import MoviezineIE from .movshare import MovShareIE @@ -197,6 +204,7 @@ from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .npo import NPOIE from .nrk import ( NRKIE, NRKTVIE, @@ -255,6 +263,7 @@ from .soundcloud import ( SoundcloudUserIE, SoundcloudPlaylistIE ) +from .soundgasm import SoundgasmIE from .southparkstudios import ( SouthParkStudiosIE, SouthparkDeIE, @@ -274,7 +283,7 @@ from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE from .teachertube import ( TeacherTubeIE, - TeacherTubeClassroomIE, + TeacherTubeUserIE, ) from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py new file mode 100644 index 000000000..34f0cd49b --- /dev/null +++ b/youtube_dl/extractor/allocine.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, + qualities, + determine_ext, +) + + +class AllocineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?Particle|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P[0-9]+)(?:\.html)?' + + _TESTS = [{ + 'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', + 'md5': '0c9fcf59a841f65635fa300ac43d8269', + 'info_dict': { + 'id': '19546517', + 'ext': 'mp4', + 'title': 'Astérix - Le Domaine des Dieux Teaser VF', + 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', + 'thumbnail': 're:http://.*\.jpg', + }, + }, { + 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', + 'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', + 'info_dict': { + 'id': '19540403', + 'ext': 'mp4', + 'title': 'Planes 2 Bande-annonce VF', + 'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d', + 'thumbnail': 're:http://.*\.jpg', + }, + }, { + 'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html', + 'md5': '101250fb127ef9ca3d73186ff22a47ce', + 'info_dict': { + 'id': '19544709', + 'ext': 'mp4', + 'title': 'Dragons 2 - Bande annonce finale VF', + 'description': 'md5:e74a4dc750894bac300ece46c7036490', + 'thumbnail': 're:http://.*\.jpg', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + typ = mobj.group('typ') + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + if typ == 'film': + video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') + else: + player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') + + player_data = json.loads(player) + video_id = compat_str(player_data['refMedia']) + + xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) + + video = xml.find('.//AcVisionVideo').attrib + quality = qualities(['ld', 'md', 'hd']) + + formats = [] + for k, v in video.items(): + if re.match(r'.+_path', k): + format_id = k.split('_')[0] + formats.append({ + 'format_id': format_id, + 'quality': quality(format_id), + 'url': v, + 'ext': determine_ext(v), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video['videoTitle'], + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 2b019daa9..31f0d417c 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,22 +1,24 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class AnitubeIE(InfoExtractor): - IE_NAME = u'anitube.se' + IE_NAME = 'anitube.se' _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P\d+)' _TEST = { - u'url': u'http://www.anitube.se/video/36621', - u'md5': u'59d0eeae28ea0bc8c05e7af429998d43', - u'file': u'36621.mp4', - u'info_dict': { - u'id': u'36621', - u'ext': u'mp4', - u'title': u'Recorder to Randoseru 01', + 'url': 'http://www.anitube.se/video/36621', + 'md5': '59d0eeae28ea0bc8c05e7af429998d43', + 'info_dict': { + 'id': '36621', + 'ext': 'mp4', + 'title': 'Recorder to Randoseru 01', + 'duration': 180.19, }, - u'skip': u'Blocked in the US', + 'skip': 'Blocked in the US', } def _real_extract(self, url): @@ -24,13 +26,15 @@ class AnitubeIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', - webpage, u'key') + key = self._html_search_regex( + r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') - config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, - key) + config_xml = self._download_xml( + 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) video_title = config_xml.find('title').text + thumbnail = config_xml.find('image').text + duration = float(config_xml.find('duration').text) formats = [] video_url = config_xml.find('file') @@ -49,5 +53,7 @@ class AnitubeIE(InfoExtractor): return { 'id': video_id, 'title': video_title, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b528a9ec5..9591bad8a 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -39,7 +39,10 @@ class ArteTvIE(InfoExtractor): formats = [{ 'forma_id': q.attrib['quality'], - 'url': q.text, + # The playpath starts at 'mp4:', if we don't manually + # split the url, rtmpdump will incorrectly parse them + 'url': q.text.split('mp4:', 1)[0], + 'play_path': 'mp4:' + q.text.split('mp4:', 1)[1], 'ext': 'flv', 'quality': 2 if q.attrib['quality'] == 'hd' else 1, } for q in config.findall('./urls/url')] @@ -111,7 +114,7 @@ class ArteTVPlus7IE(InfoExtractor): if not formats: # Some videos are only available in the 'Originalversion' # they aren't tagged as being in French or German - if all(f['versionCode'] == 'VO' for f in all_formats): + if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats): formats = all_formats else: raise ExtractorError(u'The formats list is empty') @@ -189,9 +192,10 @@ class ArteTVFutureIE(ArteTVPlus7IE): _TEST = { 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', 'info_dict': { - 'id': '050940-003', + 'id': '5201', 'ext': 'mp4', 'title': 'Les champignons au secours de la planète', + 'upload_date': '20131101', }, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e75405e..e4e4feef9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -459,6 +459,9 @@ class InfoExtractor(object): if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) + def _og_search_url(self, html, **kargs): + return self._og_search_property('url', html, **kargs) + def _html_search_meta(self, name, html, display_name=None, fatal=False): if display_name is None: display_name = name diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 55216201f..5d0bfe454 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -150,7 +150,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return { 'id': video_id, 'formats': formats, - 'uploader': info['owner_screenname'], + 'uploader': info['owner.screenname'], 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py new file mode 100644 index 000000000..cdccfd376 --- /dev/null +++ b/youtube_dl/extractor/drtv.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .subtitles import SubtitlesInfoExtractor +from .common import ExtractorError +from ..utils import parse_iso8601 + + +class DRTVIE(SubtitlesInfoExtractor): + _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/[^/]+/(?P[\da-z-]+)' + + _TEST = { + 'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8', + 'md5': '4a7e1dd65cdb2643500a3f753c942f25', + 'info_dict': { + 'id': 'partiets-mand-7-8', + 'ext': 'mp4', + 'title': 'Partiets mand (7:8)', + 'description': 'md5:a684b90a8f9336cd4aab94b7647d7862', + 'timestamp': 1403047940, + 'upload_date': '20140617', + 'duration': 1299.040, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + programcard = self._download_json( + 'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, video_id, 'Downloading video JSON') + + data = programcard['Data'][0] + + title = data['Title'] + description = data['Description'] + timestamp = parse_iso8601(data['CreatedTime'][:-5]) + + thumbnail = None + duration = None + + restricted_to_denmark = False + + formats = [] + subtitles = {} + + for asset in data['Assets']: + if asset['Kind'] == 'Image': + thumbnail = asset['Uri'] + elif asset['Kind'] == 'VideoResource': + duration = asset['DurationInMilliseconds'] / 1000.0 + restricted_to_denmark = asset['RestrictedToDenmark'] + for link in asset['Links']: + target = link['Target'] + uri = link['Uri'] + formats.append({ + 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, + 'format_id': target, + 'ext': link['FileFormat'], + 'preference': -1 if target == 'HDS' else -2, + }) + subtitles_list = asset.get('SubtitlesList') + if isinstance(subtitles_list, list): + LANGS = { + 'Danish': 'dk', + } + for subs in subtitles_list: + lang = subs['Language'] + subtitles[LANGS.get(lang, lang)] = subs['Uri'] + + if not formats and restricted_to_denmark: + raise ExtractorError( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + + self._sort_formats(formats) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, subtitles), + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9dd03aba4..f97b59845 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -383,7 +383,7 @@ class GenericIE(InfoExtractor): if not parsed_url.scheme: default_search = self._downloader.params.get('default_search') if default_search is None: - default_search = 'auto_warning' + default_search = 'error' if default_search in ('auto', 'auto_warning'): if '/' in url: @@ -397,8 +397,13 @@ class GenericIE(InfoExtractor): expected=True) else: self._downloader.report_warning( - 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url) + 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) return self.url_result('ytsearch:' + url) + elif default_search == 'error': + raise ExtractorError( + ('%r is not a valid URL. ' + 'Set --default-search "ytseach" (or run youtube-dl "ytsearch:%s" ) to search YouTube' + ) % (url, url), expected=True) else: assert ':' in default_search return self.url_result(default_search + url) @@ -620,6 +625,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'VK') + # Look for embedded ivi player + mobj = re.search(r']+?src=(["\'])(?Phttps?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Ivi') + # Look for embedded Huffington Post player mobj = re.search( r']+?src=(["\'])(?Phttps?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index cc29a7e5d..07d994b44 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -52,8 +52,7 @@ class GooglePlusIE(InfoExtractor): # Extract title # Get the first line for title - video_title = self._html_search_regex(r'[^/]+))?/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' _TESTS = [ # Single movie diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 5c71f4f09..2c100d424 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -9,6 +9,7 @@ from ..utils import ( compat_urlparse, xpath_with_ns, compat_str, + orderedSet, ) @@ -64,7 +65,10 @@ class LivestreamIE(InfoExtractor): # The original version of Livestream uses a different system class LivestreamOriginalIE(InfoExtractor): IE_NAME = 'livestream:original' - _VALID_URL = r'https?://www\.livestream\.com/(?P[^/]+)/video\?.*?clipId=(?P.*?)(&|$)' + _VALID_URL = r'''(?x)https?://www\.livestream\.com/ + (?P[^/]+)/(?Pvideo|folder) + (?:\?.*?Id=|/)(?P.*?)(&|$) + ''' _TEST = { 'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', 'info_dict': { @@ -78,10 +82,7 @@ class LivestreamOriginalIE(InfoExtractor): }, } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - user = mobj.group('user') + def _extract_video(self, user, video_id): api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) info = self._download_xml(api_url, video_id) @@ -99,3 +100,44 @@ class LivestreamOriginalIE(InfoExtractor): 'ext': 'flv', 'thumbnail': thumbnail_url, } + + def _extract_folder(self, url, folder_id): + webpage = self._download_webpage(url, folder_id) + urls = orderedSet(re.findall(r'.+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + id = mobj.group('id') + webpage = self._download_webpage(url, id) + + return { + '_type': 'url', + 'url': self._og_search_url(webpage), + } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py new file mode 100644 index 000000000..6229b2173 --- /dev/null +++ b/youtube_dl/extractor/motherless.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals + +import datetime +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class MotherlessIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P[A-Z0-9]+)' + _TESTS = [ + { + 'url': 'http://motherless.com/AC3FFE1', + 'md5': '5527fef81d2e529215dad3c2d744a7d9', + 'info_dict': { + 'id': 'AC3FFE1', + 'ext': 'flv', + 'title': 'Fucked in the ass while playing PS3', + 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], + 'upload_date': '20100913', + 'uploader_id': 'famouslyfuckedup', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } + }, + { + 'url': 'http://motherless.com/532291B', + 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', + 'info_dict': { + 'id': '532291B', + 'ext': 'mp4', + 'title': 'Amazing girl playing the omegle game, PERFECT!', + 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'], + 'upload_date': '20140622', + 'uploader_id': 'Sulivana7x', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } + } + ] + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + + video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') + age_limit = self._rta_search(webpage) + + view_count = self._html_search_regex(r'Views\s+([^<]+)<', webpage, 'view_count') + + upload_date = self._html_search_regex(r'Uploaded\s+([^<]+)<', webpage, 'upload_date') + if 'Ago' in upload_date: + days = int(re.search(r'([0-9]+)', upload_date).group(1)) + upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') + else: + upload_date = unified_strdate(upload_date) + + like_count = self._html_search_regex(r'Favorited\s+([^<]+)<', webpage, 'like_count') + + comment_count = webpage.count('class="media-comment-contents"') + uploader_id = self._html_search_regex(r'"thumb-member-username">\s+.+)' _TEST = { - 'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs', + 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', 'info_dict': { - 'id': 'd156a237-a6e9-4111-a682-039995f721f1', + 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6', 'ext': 'flv', - 'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»', - 'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77', - 'duration': 20.04, + 'title': 'Телеканал CNN переместил город Славянск в Крым', + 'description': 'md5:419a8c9f03442bc0b0a794d689360335', + 'duration': 31.05, }, 'params': { # rtmp download @@ -40,6 +41,10 @@ class NewstubeIE(InfoExtractor): def ns(s): return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'} + error_message = player.find(ns('./ErrorMessage')) + if error_message is not None: + raise ExtractorError('%s returned error: %s' % (self.IE_NAME, error_message.text), expected=True) + session_id = player.find(ns('./SessionId')).text media_info = player.find(ns('./Medias/MediaInfo')) title = media_info.find(ns('./Name')).text diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 517a72561..c0c139b5d 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -8,10 +8,9 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, compat_urlparse, - compat_str, - - ExtractorError, unified_strdate, + parse_duration, + int_or_none, ) @@ -30,6 +29,7 @@ class NiconicoIE(InfoExtractor): 'uploader_id': '2698420', 'upload_date': '20131123', 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + 'duration': 33, }, 'params': { 'username': 'ydl.niconico@gmail.com', @@ -37,17 +37,20 @@ class NiconicoIE(InfoExtractor): }, } - _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' + _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' + # Determine whether the downloader uses authentication to download video + _AUTHENTICATE = False def _real_initialize(self): - self._login() + if self._downloader.params.get('username', None) is not None: + self._AUTHENTICATE = True + + if self._AUTHENTICATE: + self._login() def _login(self): (username, password) = self._get_login_info() - if username is None: - # Login is required - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) # Log in login_form_strs = { @@ -79,44 +82,66 @@ class NiconicoIE(InfoExtractor): 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, - video_id, 'Downloading flv info') + if self._AUTHENTICATE: + # Get flv info + flv_info_webpage = self._download_webpage( + 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, 'Downloading flv info') + else: + # Get external player info + ext_player_info = self._download_webpage( + 'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) + thumb_play_key = self._search_regex( + r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') + + # Get flv info + flv_info_data = compat_urllib_parse.urlencode({ + 'k': thumb_play_key, + 'v': video_id + }) + flv_info_request = compat_urllib_request.Request( + 'http://ext.nicovideo.jp/thumb_watch', flv_info_data, + {'Content-Type': 'application/x-www-form-urlencoded'}) + flv_info_webpage = self._download_webpage( + flv_info_request, video_id, + note='Downloading flv info', errnote='Unable to download flv info') + video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - video_title = video_info.find('.//title').text - video_extension = video_info.find('.//movie_type').text - video_format = video_extension.upper() - video_thumbnail = video_info.find('.//thumbnail_url').text - video_description = video_info.find('.//description').text - video_uploader_id = video_info.find('.//user_id').text - video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) - video_view_count = video_info.find('.//view_counter').text - video_webpage_url = video_info.find('.//watch_url').text - - # uploader - video_uploader = video_uploader_id - url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id - try: - user_info = self._download_xml( - url, video_id, note='Downloading user information') - video_uploader = user_info.find('.//nickname').text - except ExtractorError as err: - self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) + title = video_info.find('.//title').text + extension = video_info.find('.//movie_type').text + video_format = extension.upper() + thumbnail = video_info.find('.//thumbnail_url').text + description = video_info.find('.//description').text + upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + view_count = int_or_none(video_info.find('.//view_counter').text) + comment_count = int_or_none(video_info.find('.//comment_num').text) + duration = parse_duration(video_info.find('.//length').text) + webpage_url = video_info.find('.//watch_url').text + + if video_info.find('.//ch_id') is not None: + uploader_id = video_info.find('.//ch_id').text + uploader = video_info.find('.//ch_name').text + elif video_info.find('.//user_id') is not None: + uploader_id = video_info.find('.//user_id').text + uploader = video_info.find('.//user_nickname').text + else: + uploader_id = uploader = None return { 'id': video_id, 'url': video_real_url, - 'title': video_title, - 'ext': video_extension, + 'title': title, + 'ext': extension, 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'uploader_id': video_uploader_id, - 'view_count': video_view_count, - 'webpage_url': video_webpage_url, + 'thumbnail': thumbnail, + 'description': description, + 'uploader': uploader, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'duration': duration, + 'webpage_url': webpage_url, } diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index c2e7b67c7..33daa0dec 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -47,7 +47,7 @@ class NineGagIE(InfoExtractor): webpage = self._download_webpage(url, display_id) post_view = json.loads(self._html_search_regex( - r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view')) + r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view')) youtube_id = post_view['videoExternalId'] title = post_view['title'] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py new file mode 100644 index 000000000..fbcbe1f40 --- /dev/null +++ b/youtube_dl/extractor/npo.py @@ -0,0 +1,62 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, +) + + +class NPOIE(InfoExtractor): + IE_NAME = 'npo.nl' + _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P[^/?]+)' + + _TEST = { + 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', + 'md5': '4b3f9c429157ec4775f2c9cb7b911016', + 'info_dict': { + 'id': 'VPWON_1220719', + 'ext': 'mp4', + 'title': 'Nieuwsuur', + 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', + 'upload_date': '20140622', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + metadata = self._download_json( + 'http://e.omroep.nl/metadata/aflevering/%s' % video_id, + video_id, + # We have to remove the javascript callback + transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) + ) + token_page = self._download_webpage( + 'http://ida.omroep.nl/npoplayer/i.js', + video_id, + note='Downloading token' + ) + token = self._search_regex(r'npoplayer.token = "(.+?)"', token_page, 'token') + streams_info = self._download_json( + 'http://ida.omroep.nl/odi/?prid=%s&puboptions=h264_std&adaptive=yes&token=%s' % (video_id, token), + video_id + ) + + stream_info = self._download_json( + streams_info['streams'][0] + '&type=json', + video_id, + 'Downloading stream info' + ) + + return { + 'id': video_id, + 'title': metadata['titel'], + 'ext': 'mp4', + 'url': stream_info['url'], + 'description': metadata['info'], + 'thumbnail': metadata['images'][-1]['url'], + 'upload_date': unified_strdate(metadata['gidsdatum']), + } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index cb4305349..ba3dd707f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -35,7 +35,8 @@ class RaiIE(SubtitlesInfoExtractor): 'description': '', 'upload_date': '20140612', 'duration': 1758, - } + }, + 'skip': 'Error 404', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py new file mode 100644 index 000000000..a4f8ce6c3 --- /dev/null +++ b/youtube_dl/extractor/soundgasm.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', + 'md5': '010082a2c802c5275bb00030743e75ad', + 'info_dict': { + 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', + 'ext': 'm4a', + 'title': 'ytdl_Piano-sample', + 'description': 'Royalty Free Sample Music' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('title') + audio_title = mobj.group('user') + '_' + mobj.group('title') + webpage = self._download_webpage(url, display_id) + audio_url = self._html_search_regex( + r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') + audio_id = re.split('\/|\.', audio_url)[-2] + description = self._html_search_regex( + r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', + fatal=False) + + return { + 'id': audio_id, + 'display_id': display_id, + 'url': audio_url, + 'title': audio_title, + 'description': description + } diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 36331529e..25b9864ad 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -20,13 +20,13 @@ class TagesschauIE(InfoExtractor): 'thumbnail': 're:^http:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-196.html', - 'md5': '8aaa8bf3ae1ca2652309718c03019128', + 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', + 'md5': '66652566900963a3f962333579eeffcf', 'info_dict': { - 'id': '196', + 'id': '5964', 'ext': 'mp4', - 'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt', - 'description': 'md5:f22e4af75821d174fa6c977349682691', + 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', + 'description': 'md5:07bfc78c48eec3145ed4805299a1900a', 'thumbnail': 're:http://.*\.jpg', }, }] diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index b3cb6bd76..2c2113b14 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -14,7 +14,7 @@ class TeacherTubeIE(InfoExtractor): IE_NAME = 'teachertube' IE_DESC = 'teachertube.com videos' - _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', @@ -45,6 +45,15 @@ class TeacherTubeIE(InfoExtractor): 'title': 'PER ASPERA AD ASTRA', 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', }, + }, { + 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', + 'md5': '9c79fbb2dd7154823996fc28d4a26998', + 'info_dict': { + 'id': '297790', + 'ext': 'mp4', + 'title': 'Intro Video - Schleicher', + 'description': 'Intro Video - Why to flip, how flipping will', + }, }] def _real_extract(self, url): @@ -66,6 +75,7 @@ class TeacherTubeIE(InfoExtractor): media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage) media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage)) + media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage)) formats = [ { @@ -79,28 +89,36 @@ class TeacherTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': self._html_search_regex(r'var\s+thumbUrl\s*=\s*"([^"]+)"', webpage, 'thumbnail'), + 'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'), 'formats': formats, 'description': description, } -class TeacherTubeClassroomIE(InfoExtractor): - IE_NAME = 'teachertube:classroom' - IE_DESC = 'teachertube.com online classrooms' +class TeacherTubeUserIE(InfoExtractor): + IE_NAME = 'teachertube:user:collection' + IE_DESC = 'teachertube.com user and collection videos' + + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' - _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)' + _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('user') - rss = self._download_xml( - 'http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id, - user_id, 'Downloading classroom RSS') + urls = [] + webpage = self._download_webpage(url, user_id) + urls.extend(re.findall(self._MEDIA_RE, webpage)) + + pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] + for p in pages: + more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) + webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) + urls.extend(re.findall(self._MEDIA_RE, webpage)) entries = [] - for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'): - entries.append(self.url_result(url.attrib['url'], 'TeacherTube')) + for url in urls: + entries.append(self.url_result(url, 'TeacherTube')) return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 34008afc6..0f389bd93 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -1,10 +1,13 @@ +# -*- coding:utf-8 -*- +from __future__ import unicode_literals + from .common import InfoExtractor import re class ToypicsIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' + _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' _TEST = { 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'md5': '16e806ad6d6f58079d210fe30985e08b', @@ -61,7 +64,7 @@ class ToypicsUserIE(InfoExtractor): note='Downloading page %d/%d' % (n, page_count)) urls.extend( re.findall( - r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">', + r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">', lpage)) return { diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 544369068..2882c1809 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import re @@ -10,14 +11,27 @@ from ..utils import ( class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)' - _TEST = { + _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', - 'file': '54196191430.mp4', 'md5': '479bb068e5b16462f5176a6828829767', 'info_dict': { - "title": "tatiana maslany news" + 'id': '54196191430', + 'ext': 'mp4', + 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', + 'description': 'md5:dfac39636969fe6bf1caa2d50405f069', + 'thumbnail': 're:http://.*\.jpg', } - } + }, { + 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', + 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', + 'info_dict': { + 'id': '90208453769', + 'ext': 'mp4', + 'title': '5SOS STRUM ;)', + 'description': 'md5:dba62ac8639482759c8eb10ce474586a', + 'thumbnail': 're:http://.*\.jpg', + } + }] def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) @@ -48,6 +62,7 @@ class TumblrIE(InfoExtractor): return [{'id': video_id, 'url': video_url, 'title': video_title, + 'description': self._html_search_meta('description', webpage), 'thumbnail': video_thumbnail, 'ext': ext }] diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index fb132aef6..a7953a7e7 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -49,6 +49,7 @@ class VeohIE(InfoExtractor): 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', 'uploader': 'newsy-videos', }, + 'skip': 'This video has been deleted.', }, ] diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index b5034b02f..a647807d0 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -4,7 +4,10 @@ import re import base64 from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + unified_strdate, + int_or_none, +) class VideoTtIE(InfoExtractor): @@ -50,9 +53,9 @@ class VideoTtIE(InfoExtractor): 'thumbnail': settings['config']['thumbnail'], 'upload_date': unified_strdate(video['added']), 'uploader': video['owner'], - 'view_count': int(video['view_count']), - 'comment_count': int(video['comment_count']), - 'like_count': int(video['liked']), - 'dislike_count': int(video['disliked']), + 'view_count': int_or_none(video['view_count']), + 'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']), + 'like_count': int_or_none(video['liked']), + 'dislike_count': int_or_none(video['disliked']), 'formats': formats, } \ No newline at end of file diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index fb082f364..918bd1098 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' - _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' + _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' _NETRC_MACHINE = 'vk' _TESTS = [ @@ -27,7 +27,7 @@ class VKIE(InfoExtractor): 'id': '162222515', 'ext': 'flv', 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 'Noize MC', + 'uploader': 're:Noize MC.*', 'duration': 195, }, }, @@ -62,11 +62,47 @@ class VKIE(InfoExtractor): 'id': '164049491', 'ext': 'mp4', 'uploader': 'Триллеры', - 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', + 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 'duration': 8352, }, 'skip': 'Requires vk account credentials', }, + { + 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', + 'md5': 'd82c22e449f036282d1d3f7f4d276869', + 'info_dict': { + 'id': '166094326', + 'ext': 'mp4', + 'uploader': 'Киномания - лучшее из мира кино', + 'title': 'Запах женщины (1992)', + 'duration': 9392, + }, + 'skip': 'Requires vk account credentials', + }, + { + 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', + 'md5': '4d7a5ef8cf114dfa09577e57b2993202', + 'info_dict': { + 'id': '168067957', + 'ext': 'mp4', + 'uploader': 'Киномания - лучшее из мира кино', + 'title': ' ', + 'duration': 7291, + }, + 'skip': 'Requires vk account credentials', + }, + { + 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', + 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', + 'note': 'ivi.ru embed', + 'info_dict': { + 'id': '60690', + 'ext': 'mp4', + 'title': 'Книга Илая', + 'duration': 6771, + }, + 'skip': 'Only works from Russia', + }, ] def _login(self): @@ -110,6 +146,16 @@ class VKIE(InfoExtractor): if m_yt is not None: self.to_screen('Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') + + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) + if m_opts: + m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) + if m_opts_url: + opts_url = m_opts_url.group(1) + if opts_url.startswith('//'): + opts_url = 'http:' + opts_url + return self.url_result(opts_url) + data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') data = json.loads(data_json) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index feeb44b45..f741ba540 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import re @@ -54,14 +55,14 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html', - 'md5': 'cfff440d4ee64114083ac44676df5d15', + 'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', + 'md5': '24e83813e832badb0a8d7d1ef9ef0691', 'info_dict': { - 'id': 'mdb-363068', + 'id': 'mdb-463528', 'ext': 'mp3', - 'title': 'Grenzenlos lecker - Baklava', + 'title': 'Süpersong: Soul Bossa Nova', 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140311', + 'upload_date': '20140630', }, }, ] @@ -127,9 +128,10 @@ class WDRMobileIE(InfoExtractor): 'info_dict': { 'title': '4283021', 'id': '421735', + 'ext': 'mp4', 'age_limit': 0, }, - '_skip': 'Will be depublicized shortly' + 'skip': 'Problems with loading data.' } def _real_extract(self, url): @@ -139,6 +141,7 @@ class WDRMobileIE(InfoExtractor): 'title': mobj.group('title'), 'age_limit': int(mobj.group('age_limit')), 'url': url, + 'ext': determine_ext(url), 'user_agent': 'mobile', } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6bdea1c44..6123e1256 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -865,71 +865,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if player_url is not None: - if player_url.startswith(u'//'): - player_url = u'https:' + player_url - try: - player_id = (player_url, len(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, len(s) - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, len(s)) - return func(s) - except Exception: - tb = traceback.format_exc() - self._downloader.report_warning( - u'Automatic signature extraction failed: ' + tb) - - self._downloader.report_warning( - u'Warning: Falling back to static signature algorithm') - - return self._static_decrypt_signature( - s, video_id, player_url, age_gate) - - def _static_decrypt_signature(self, s, video_id, player_url, age_gate): - if age_gate: - # The videos with age protection use another player, so the - # algorithms can be different. - if len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] - - if len(s) == 93: - return s[86:29:-1] + s[88] + s[28:5:-1] - elif len(s) == 92: - return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] - elif len(s) == 91: - return s[84:27:-1] + s[86] + s[26:5:-1] - elif len(s) == 90: - return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] - elif len(s) == 89: - return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] - elif len(s) == 88: - return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] - elif len(s) == 87: - return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] - elif len(s) == 86: - return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] - elif len(s) == 85: - return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] - elif len(s) == 84: - return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] - elif len(s) == 83: - return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] - elif len(s) == 82: - return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] - elif len(s) == 81: - return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] - elif len(s) == 80: - return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] - elif len(s) == 79: - return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + if player_url is None: + raise ExtractorError(u'Cannot decrypt signature without player_url') - else: - raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) + if player_url.startswith(u'//'): + player_url = u'https:' + player_url + try: + player_id = (player_url, len(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, len(s) + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) + except Exception as e: + tb = traceback.format_exc() + raise ExtractorError( + u'Automatic signature extraction failed: ' + tb, cause=e) def _get_available_subtitles(self, video_id, webpage): try: @@ -1698,14 +1653,14 @@ class YoutubeSearchURLIE(InfoExtractor): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') + r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML') part_codes = re.findall( r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) entries = [] for part_code in part_codes: part_title = self._html_search_regex( - r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) + [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) part_url_snippet = self._html_search_regex( r'(?s)href="([^"]+)"', part_code, 'item URL') part_url = compat_urlparse.urljoin( @@ -1825,10 +1780,21 @@ class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$| + (?:https?://)?[^/]+/watch\?(?: + feature=[a-z_]+| + annotation_id=annotation_[^&]+ + )?$| (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ ''' + _TESTS = [{ + 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/watch?', + 'only_matching': True, + }] + def _real_extract(self, url): raise ExtractorError( u'Did you forget to quote the URL? Remember that & is a meta ' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 449482d3c..3bbb07704 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -59,7 +59,7 @@ class JSInterpreter(object): if member == 'split("")': return list(val) if member == 'join("")': - return u''.join(val) + return ''.join(val) if member == 'length': return len(val) if member == 'reverse()': @@ -99,7 +99,7 @@ class JSInterpreter(object): def extract_function(self, funcname): func_m = re.search( - (r'(?:function %s|%s\s*=\s*function)' % ( + (r'(?:function %s|[{;]%s\s*=\s*function)' % ( re.escape(funcname), re.escape(funcname))) + r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', self.code) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b97e62ae9..09312e81a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -816,6 +816,9 @@ def unified_strdate(date_str): '%d %b %Y', '%B %d %Y', '%b %d %Y', + '%b %dst %Y %I:%M%p', + '%b %dnd %Y %I:%M%p', + '%b %dth %Y %I:%M%p', '%Y-%m-%d', '%d.%m.%Y', '%d/%m/%Y', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 77f6083d5..d6b05892c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.06.24.1' +__version__ = '2014.07.11'