From: Sergey M․ Date: Tue, 7 Oct 2014 13:24:52 +0000 (+0700) Subject: Merge branch 'walla' of https://github.com/lenaten/youtube-dl into lenaten-walla X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=c59c3c84ede823e5c97f695ae904545c615e4ded;hp=31d06400ecee79ecf3a0bd38c0702f165fd6d958;p=youtube-dl Merge branch 'walla' of https://github.com/lenaten/youtube-dl into lenaten-walla --- diff --git a/README.md b/README.md index 5d15decb5..cabc5eb9a 100644 --- a/README.md +++ b/README.md @@ -348,21 +348,34 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231 # FAQ -### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists +### How do I update youtube-dl? -YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos. +If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). + +If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. -Alternatively, uninstall the youtube-dl package and follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html). In a pinch, this should do if you used `apt-get` before to install youtube-dl: +As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like + + sudo apt-get remove -y youtube-dl + +Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html): ``` -sudo apt-get remove -y youtube-dl sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+x /usr/local/bin/youtube-dl hash -r ``` +Again, from then on you'll be able to update with `sudo youtube-dl -U`. + +### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists + +YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos. + +If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version. See above for a way to update.
+ ### Do I always have to pass in `--max-quality FORMAT`, or `-citw`? By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`. @@ -442,8 +455,6 @@ If you want to add support for a new site, you can follow this quick list (assum # coding: utf-8 from __future__ import unicode_literals - import re - from .common import InfoExtractor @@ -451,7 +462,7 @@ If you want to add support for a new site, you can follow this quick list (assum _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'http://yourextractor.com/watch/42', - 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', 'ext': 'mp4', @@ -466,8 +477,7 @@ If you want to add support for a new site, you can follow this quick list (assum } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # TODO more code goes here, for example ... webpage = self._download_webpage(url, video_id) diff --git a/test/helper.py b/test/helper.py index 7f3ab8438..62cb3ce02 100644 --- a/test/helper.py +++ b/test/helper.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import errno import io import hashlib @@ -12,6 +14,7 @@ from youtube_dl import YoutubeDL from youtube_dl.utils import ( compat_str, preferredencoding, + write_string, ) @@ -40,10 +43,10 @@ def report_warning(message): If stderr is a tty file the 'WARNING:' will be colored ''' if sys.stderr.isatty() and os.name != 'nt': - _msg_header = u'\033[0;33mWARNING:\033[0m' + _msg_header = '\033[0;33mWARNING:\033[0m' else: - _msg_header = u'WARNING:' - output = u'%s %s\n' % (_msg_header, message) + _msg_header = 'WARNING:' + output = '%s %s\n' % (_msg_header, message) if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: output = output.encode(preferredencoding()) sys.stderr.write(output) @@ -103,22 +106,22 @@ def expect_info_dict(self, expected_dict, got_dict): self.assertTrue( isinstance(got, compat_str), - u'Expected a %s object, but got %s for field %s' % ( + 'Expected a %s object, but got %s for field %s' % ( compat_str.__name__, type(got).__name__, info_field)) self.assertTrue( match_rex.match(got), - u'field %s (value: %r) should match %r' % (info_field, got, match_str)) + 'field %s (value: %r) should match %r' % (info_field, got, match_str)) elif isinstance(expected, type): got = got_dict.get(info_field) self.assertTrue(isinstance(got, expected), - u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) + 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got_dict.get(info_field)) else: got = got_dict.get(info_field) self.assertEqual(expected, got, - u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) # Check for the presence of
mandatory fields if got_dict.get('_type') != 'playlist': @@ -126,7 +129,7 @@ def expect_info_dict(self, expected_dict, got_dict): self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for mandatory fields that are automatically set by YoutubeDL for key in ['webpage_url', 'extractor', 'extractor_key']: - self.assertTrue(got_dict.get(key), u'Missing field: %s' % key) + self.assertTrue(got_dict.get(key), 'Missing field: %s' % key) # Are checkable fields missing from the test case definition? test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) @@ -134,7 +137,15 @@ def expect_info_dict(self, expected_dict, got_dict): if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: - sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n') + def _repr(v): + if isinstance(v, compat_str): + return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'") + else: + return repr(v) + info_dict_str = ''.join( + ' %s: %s,\n' % (_repr(k), _repr(v)) + for k, v in test_info_dict.items()) + write_string('\n"info_dict": {' + info_dict_str + '}\n', out=sys.stderr) self.assertFalse( missing_keys, 'Missing keys in test definition: %s' % ( diff --git a/test/test_download.py b/test/test_download.py index 2b8ac6975..8178015ea 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -139,7 +139,9 @@ def generator(test_case): if is_playlist: self.assertEqual(res_dict['_type'], 'playlist') + self.assertTrue('entries' in res_dict) expect_info_dict(self, test_case.get('info_dict', {}), res_dict) + if 'playlist_mincount' in test_case: assertGreaterEqual( self, @@ -188,7 +190,7 @@ def generator(test_case): expect_info_dict(self, tc.get('info_dict', {}), info_dict) finally: try_rm_tcs_files() - if is_playlist and res_dict is not None: + if is_playlist and res_dict is not None and res_dict.get('entries'): # Remove all other files that may have been extracted if the # extractor returns full results even with extract_flat res_tcs = [{'info_dict': e} for e in res_dict['entries']] diff --git a/test/test_utils.py b/test/test_utils.py index 3efbed29d..bcca0efea 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,7 +22,8 @@ from youtube_dl.utils import ( fix_xml_ampersands, get_meta_content, orderedSet, - PagedList, + OnDemandPagedList, + InAdvancePagedList, parse_duration, read_batch_urls, sanitize_filename, @@ -43,6 +44,7 @@ from youtube_dl.utils import ( limit_length, escape_rfc3986, escape_url, + js_to_json, ) @@ -137,6 +139,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214') self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011') self.assertEqual(unified_strdate('1968-12-10'), '19681210') + self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128') def test_find_xpath_attr(self): testxml = ''' @@ -246,10 +249,14 @@ class TestUtil(unittest.TestCase): for i in range(firstid, upto): yield i - pl = PagedList(get_page, pagesize) + pl = OnDemandPagedList(get_page, pagesize) got = pl.getslice(*sliceargs) self.assertEqual(got, expected) + iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize) + got = iapl.getslice(*sliceargs) + self.assertEqual(got, expected) + testPL(5, 2, (), [0, 1, 2, 3, 4]) testPL(5, 2, (1,), [1, 2, 3, 4]) testPL(5, 2, (2,), [2, 3, 
4]) @@ -325,5 +332,28 @@ class TestUtil(unittest.TestCase): ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + def test_js_to_json_realworld(self): + inp = '''{ + 'clip':{'provider':'pseudo'} + }''' + self.assertEqual(js_to_json(inp), '''{ + "clip":{"provider":"pseudo"} + }''') + json.loads(js_to_json(inp)) + + inp = '''{ + 'playlist':[{'controls':{'all':null}}] + }''' + self.assertEqual(js_to_json(inp), '''{ + "playlist":[{"controls":{"all":null}}] + }''') + + def test_js_to_json_edgecases(self): + on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") + self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) + + on = js_to_json('{"abc": true}') + self.assertEqual(json.loads(on), {'abc': True}) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 604e76ab6..df2cb09f2 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -47,18 +47,6 @@ _TESTS = [ '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), - ( - 'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf', - 'swf', - 86, - 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?' - ), - ( - 'http://s.ytimg.com/yts/swfbin/player-vflmDyk47/watch_as3.swf', - 'swf', - 'F375F75BF2AFDAAF2666E43868D46816F83F13E81C46.3725A8218E446A0DECD33F79DC282994D6AA92C92C9', - '9C29AA6D499282CD97F33DCED0A644E8128A5273.64C18E31F38361864D86834E6662FAADFA2FB57F' - ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', 'js', diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b07c0b4cc..76726305a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -134,13 +134,16 @@ from .gamestar import GameStarIE from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .globo import GloboIE from .godtube import GodTubeIE +from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE from .hark import HarkIE +from .heise import HeiseIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hornbunny import HornBunnyIE @@ -188,6 +191,7 @@ from .livestream import ( LivestreamOriginalIE, LivestreamShortenerIE, ) +from .lrt import LRTIE from .lynda import ( LyndaIE, LyndaCourseIE @@ -261,6 +265,7 @@ from .nrk import ( from .ntv import NTVIE from .nytimes import NYTimesIE from .nuvid import NuvidIE +from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .orf import ( ORFTVthekIE, @@ -271,6 +276,8 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .planetaplay import PlanetaPlayIE +from .played import PlayedIE from .playfm import PlayFMIE from .playvid import PlayvidIE from .podomatic import PodomaticIE @@ -350,6 +357,7 @@ from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE +from .tapely import TapelyIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -363,11 +371,15 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import 
TF1IE from .theplatform import ThePlatformIE +from .thesixtyone import TheSixtyOneIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE -from .thvideo import THVideoIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE @@ -408,11 +420,12 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vimeo import ( VimeoIE, - VimeoChannelIE, - VimeoUserIE, VimeoAlbumIE, + VimeoChannelIE, VimeoGroupsIE, + VimeoLikesIE, VimeoReviewIE, + VimeoUserIE, VimeoWatchLaterIE, ) from .vimple import VimpleIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 7d89f44ee..69f89320c 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -22,8 +22,7 @@ class ABCIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) urls_info_json = self._search_regex( diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 54cec1c2f..8de9c11ea 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,8 +8,6 @@ from ..utils import ( determine_ext, ExtractorError, qualities, - compat_urllib_parse_urlparse, - compat_urllib_parse, int_or_none, parse_duration, unified_strdate, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 957d35979..c3d02f85e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + upload_date_str = player_info.get('shootingDate') + if not upload_date_str: + upload_date_str = player_info.get('VDA', '').split(' ')[0] + info_dict = { 'id': player_info['VID'], 'title': player_info['VTI'], 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c569aa4d2..c13446665 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -15,13 +15,23 @@ class BandcampIE(InfoExtractor): _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', - 'file': '1812978515.mp3', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { - "title": "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", - "duration": 9.8485, + 'id': '1812978515', + 'ext': 'mp3', + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'md5': '2b68e5851514c20efdff2afc5603b8b4', 'info_dict': { 'id': '2650410135', 'ext': 'mp3', 'title': 'Lanius (Battle)', 'uploader': 'Ben Prunty Music', }, }] def _real_extract(self, url): @@ -59,9 +69,9 @@ class BandcampIE(InfoExtractor): raise ExtractorError('No free songs found') download_link = m_download.group(1) - video_id = re.search( r'var TralbumData = {(.*?)id:
(?P<id>\d*?)$', - webpage, re.MULTILINE | re.DOTALL).group('id') + video_id = self._search_regex( + r'var TralbumData = {.*?id: (?P<id>\d+),?$', + webpage, 'video id', flags=re.MULTILINE | re.DOTALL) download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') # We get the dictionary of the track from some javascript code diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 4e2960c62..2e277c8c3 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -26,6 +26,8 @@ class BRIE(InfoExtractor): 'title': 'Wenn das Traditions-Theater wackelt', 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', 'duration': 34, + 'uploader': 'BR', + 'upload_date': '20140802', } }, { @@ -66,8 +68,7 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') + display_id = self._match_id(url) page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 1bfc9f35b..2c0e5eea2 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,37 +4,61 @@ import re import json from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, +) class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)' - _TEST = { + _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', - 'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b', + 'md5': '33aa4ff477ecd124d18d7b5d23b87ce5', 'info_dict': { 'id': '2468056', 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', } - } + }, { + 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split("-")[-1] - embed_url = 'http://www.break.com/embed/%s' % video_id - webpage = self._download_webpage(embed_url, video_id) - info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', - webpage, 'info json', flags=re.DOTALL) - info = json.loads(info_json) - video_url = info['videoUri'] + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.break.com/embed/%s' % video_id, video_id) + info = json.loads(self._search_regex( + r'var embedVars = ({.*})\s*?</script>', + webpage, 'info json', flags=re.DOTALL)) + youtube_id = info.get('youtubeId') if youtube_id: return self.url_result(youtube_id, 'Youtube') - final_url = video_url + '?' + info['AuthToken'] + formats = [{ + 'url': media['uri'] + '?' 
+ info['AuthToken'], + 'tbr': media['bitRate'], + 'width': media['width'], + 'height': media['height'], + } for media in info['media']] + + if not formats: + formats.append({ + 'url': info['videoUri'] + }) + + self._sort_formats(formats) + + duration = int_or_none(info.get('videoLengthInSeconds')) + age_limit = parse_age_limit(info.get('audienceRating')) + return { 'id': video_id, - 'url': final_url, 'title': info['contentName'], 'thumbnail': info['thumbUri'], + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 65c12136a..d4227e6eb 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - 'duration': 1317, } } @@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor): thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", webpage, 'thumbnail', fatal=False) - duration = int_or_none(self._search_regex( - r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False)) return { 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': duration, 'age_limit': self._rta_search(webpage), 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 60cab6f4e..450c7dfd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import base64 +import datetime import hashlib import json import netrc @@ -21,6 +22,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, @@ -136,6 +138,8 @@ class InfoExtractor(object): Unless mentioned otherwise, the fields should be Unicode strings. + Unless mentioned otherwise, None is equivalent to absence of information. + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. 
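The hunk below introduces the `InfoExtractor._match_id` helper that extractor diffs throughout this commit (abc.py above; br.py, dailymotion.py and others further down) switch to. A minimal sketch of the refactor, using a hypothetical extractor for illustration:

```python
from .common import InfoExtractor


class YourExtractorIE(InfoExtractor):  # hypothetical, for illustration only
    _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        # Old pattern, duplicated in nearly every extractor:
        #     mobj = re.match(self._VALID_URL, url)
        #     video_id = mobj.group('id')
        # New pattern: the base class re-matches _VALID_URL against the URL
        # and returns the named 'id' group in a single call.
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
        }
```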
@@ -164,6 +168,14 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) is not None + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + @classmethod def working(cls): """Getter method for _WORKING.""" @@ -324,7 +336,11 @@ class InfoExtractor(object): try: return json.loads(json_string) except ValueError as ve: - raise ExtractorError('Failed to download JSON', cause=ve) + errmsg = '%s: Failed to parse JSON ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def report_warning(self, msg, video_id=None): idstr = '' if video_id is None else '%s: ' % video_id @@ -705,6 +721,34 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + def _live_title(self, name): + """ Generate the title for a live video """ + now = datetime.datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M") + return name + ' ' + now_str + + def _int(self, v, name, fatal=False, **kwargs): + res = int_or_none(v, **kwargs) + if 'get_attr' in kwargs: + print(getattr(v, kwargs['get_attr'])) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + + def _float(self, v, name, fatal=False, **kwargs): + res = float_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 66a8f16d9..dbcf5d6a7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): ] def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - + video_id = self._match_id(url) url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information @@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = self._search_regex( - r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False) - if view_count is not None: - view_count = str_to_int(view_count) + view_count = str_to_int(self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', + webpage, 'view count', fatal=False)) + + title = self._og_search_title(webpage, default=None) + if title is None: + title = self._html_search_regex( + r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, + 'title') return { - 'id': video_id, + 'id': video_id, 'formats': formats, 'uploader': info['owner.screenname'], - 'upload_date': video_upload_date, - 'title': self._og_search_title(webpage), - 'subtitles': video_subtitles, + 'upload_date': video_upload_date, + 'title': title, + 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, 'view_count': view_count, diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 817a9bd61..5f24ac721 100644 --- a/youtube_dl/extractor/dropbox.py 
+++ b/youtube_dl/extractor/dropbox.py @@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor): video_id = mobj.group('id') fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] - video_url = ( - re.sub(r'[?&]dl=0', '', url) + - ('?' if '?' in url else '&') + 'dl=1') + video_url = re.sub(r'[?&]dl=0', '', url) + video_url += ('?' if '?' not in video_url else '&') + 'dl=1' return { 'id': video_id, diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 4ba323148..2cba82532 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,20 +9,20 @@ from ..utils import ExtractorError class EitbIE(InfoExtractor): - IE_NAME = u'eitb.tv' + IE_NAME = 'eitb.tv' _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' _TEST = { - u'add_ie': ['Brightcove'], - u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', - u'md5': u'edf4436247185adee3ea18ce64c47998', - u'info_dict': { - u'id': u'2743577154001', - u'ext': u'mp4', - u'title': u'60 minutos (Lasa y Zabala, 30 años)', + 'add_ie': ['Brightcove'], + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'md5': 'edf4436247185adee3ea18ce64c47998', + 'info_dict': { + 'id': '2743577154001', + 'ext': 'mp4', + 'title': '60 minutos (Lasa y Zabala, 30 años)', # All videos from eitb has this description in the brightcove info - u'description': u'.', - u'uploader': u'Euskal Telebista', + 'description': '.', + 'uploader': 'Euskal Telebista', }, } @@ -30,7 +32,7 @@ class EitbIE(InfoExtractor): webpage = self._download_webpage(url, chapter_id) bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is None: - raise ExtractorError(u'Could not extract the Brightcove url') + raise ExtractorError('Could not extract the Brightcove url') # The BrightcoveExperience object doesn't contain the video id, we set # it manually bc_url += '&%40videoPlayer={0}'.format(chapter_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 522aa3d63..bb231ecb1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)' _TEST = { 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', - 'md5': '3b427ae4b9d60619106de3185c2987cd', + 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { 'id': '95008', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', 'duration': 194, 'view_count': int, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 60e68d98a..3ad993751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'duration': 38, - 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...', + 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', } }, { 'note': 'Video without discernible title', diff --git a/youtube_dl/extractor/funnyordie.py 
b/youtube_dl/extractor/funnyordie.py index 721e5fce0..d966e8403 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor): }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', - 'md5': 'ff4d83318f89776ed0250634cfaa8d36', + 'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad', 'info_dict': { 'id': 'e402820827', 'ext': 'mp4', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 367f930dd..c16da70f1 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -155,7 +155,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', @@ -180,13 +179,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -226,21 +225,6 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, - # smotri embed - { - 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', - 'md5': 'ec40048448e9284c9a1de77bb188108b', - 'info_dict': { - 'id': 'v27008541fad', - 'ext': 'mp4', - 'title': 'Крым и Севастополь вошли в состав России', - 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', - 'duration': 900, - 'upload_date': '20140318', - 'uploader': 'rbctv_2012_4', - 'uploader_id': 'rbctv_2012_4', - }, - }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -295,13 +279,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, @@ -397,12 +381,6 @@ class GenericIE(InfoExtractor): }, ] - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning('Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) @@ -502,6 +480,7 @@ class GenericIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url) force_videoid = None + is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid @@ -544,6 +523,9 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } + if not self._downloader.params.get('test', 
False) and not is_intentional: + self._downloader.report_warning('Falling back on generic information extractor.') + try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -657,6 +639,16 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) + # Look for embedded Dailymotion playlist player (#3822) + m = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) + if m: + playlists = re.findall( + r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) + if playlists: + return _playlist_from_matches( + playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + # Look for embedded Wistia player match = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py new file mode 100644 index 000000000..77c3ad4fc --- /dev/null +++ b/youtube_dl/extractor/globo.py @@ -0,0 +1,398 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import math + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + compat_str, + compat_chr, + compat_ord, +) + + +class GloboIE(InfoExtractor): + _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + + _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' + _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' + + _VIDEOID_REGEXES = [ + r'\bdata-video-id="(\d+)"', + r'\bdata-player-videosids="(\d+)"', + r'<div[^>]+\bid="(\d+)"', + ] + + _RESIGN_EXPIRATION = 86400 + + _TESTS = [ + { + 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', + 'md5': '03ebf41cb7ade43581608b7d9b71fab0', + 'info_dict': { + 'id': '3654973', + 'ext': 'mp4', + 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', + 'duration': 251.585, + 'uploader': 'SporTV', + 'uploader_id': 698, + 'like_count': int, + } + }, + { + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'Globo.com', + 'uploader_id': 265, + 'like_count': int, + } + }, + { + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', + 'info_dict': { + 'id': '3652183', + 'ext': 'mp4', + 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', + 'duration': 110.711, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + 'like_count': int, + } + }, + ] + + class MD5(): + HEX_FORMAT_LOWERCASE = 0 + HEX_FORMAT_UPPERCASE = 1 + BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' + BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' + PADDING = '=0xFF01DD' + hexcase = 0 + b64pad = '' + + def __init__(self): + pass + + class JSArray(list): + def __getitem__(self, y): + try: + return list.__getitem__(self, y) + except IndexError: + return 0 + + def __setitem__(self, i, y): + try: + return list.__setitem__(self, i, y) + except IndexError: + self.extend([0] * (i - len(self) + 1)) + self[-1] = y + + 
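+        # Note: the methods below are a direct port of the MD5 routine from
+        # Globo's Flash player, not a standard MD5: rstr2b64() uses a
+        # URL-safe base64 alphabet ('-' and '_' instead of '+' and '/'), and
+        # str2rstr_utf8() appends part of the PADDING constant to its input,
+        # so b64_md5() deliberately differs from a hashlib-based digest.
+        # _real_extract() below uses b64_md5() to re-sign the security hash
+        # that authorizes each format URL.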
@classmethod + def hex_md5(cls, param1): + return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) + + @classmethod + def b64_md5(cls, param1, param2=None): + return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) + + @classmethod + def any_md5(cls, param1, param2): + return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) + + @classmethod + def rstr_md5(cls, param1): + return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) + + @classmethod + def rstr2hex(cls, param1): + _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' + _loc_3 = '' + for _loc_5 in range(0, len(param1)): + _loc_4 = compat_ord(param1[_loc_5]) + _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] + return _loc_3 + + @classmethod + def rstr2b64(cls, param1): + _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + _loc_3 = '' + _loc_4 = len(param1) + for _loc_5 in range(0, _loc_4, 3): + _loc_6_1 = compat_ord(param1[_loc_5]) << 16 + _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 + _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 + _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 + for _loc_7 in range(0, 4): + if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: + _loc_3 += cls.b64pad + else: + _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] + return _loc_3 + + @staticmethod + def rstr2any(param1, param2): + _loc_3 = len(param2) + _loc_4 = [] + _loc_9 = [0] * ((len(param1) >> 2) + 1) + for _loc_5 in range(0, len(_loc_9)): + _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) + + while len(_loc_9) > 0: + _loc_8 = [] + _loc_7 = 0 + for _loc_5 in range(0, len(_loc_9)): + _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] + _loc_6 = math.floor(_loc_7 / _loc_3) + _loc_7 -= _loc_6 * _loc_3 + if len(_loc_8) > 0 or _loc_6 > 0: + _loc_8[len(_loc_8)] = _loc_6 + + _loc_4[len(_loc_4)] = _loc_7 + _loc_9 = _loc_8 + + _loc_10 = '' + _loc_5 = len(_loc_4) - 1 + while _loc_5 >= 0: + _loc_10 += param2[_loc_4[_loc_5]] + _loc_5 -= 1 + + return _loc_10 + + @classmethod + def str2rstr_utf8(cls, param1, param2=None): + _loc_3 = '' + _loc_4 = -1 + if not param2: + param2 = cls.PADDING + param1 = param1 + param2[1:9] + while True: + _loc_4 += 1 + if _loc_4 >= len(param1): + break + _loc_5 = compat_ord(param1[_loc_4]) + _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 + if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: + _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) + _loc_4 += 1 + if _loc_5 <= 127: + _loc_3 += compat_chr(_loc_5) + continue + if _loc_5 <= 2047: + _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) + continue + if _loc_5 <= 65535: + _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( + 128 | _loc_5 & 63) + continue + if _loc_5 <= 2097151: + _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( + 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) + return _loc_3 + + @staticmethod + def rstr2binl(param1): + _loc_2 = [0] * ((len(param1) >> 2) + 1) + for _loc_3 in range(0, len(_loc_2)): + _loc_2[_loc_3] = 0 + for _loc_3 in range(0, len(param1) * 8, 8): + _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 + return _loc_2 + + @staticmethod + def binl2rstr(param1): + _loc_2 = '' + for _loc_3 in range(0, len(param1) * 32, 8): + _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) + 
return _loc_2 + + @classmethod + def binl_md5(cls, param1, param2): + param1 = cls.JSArray(param1) + param1[param2 >> 5] |= 128 << param2 % 32 + param1[(param2 + 64 >> 9 << 4) + 14] = param2 + _loc_3 = 1732584193 + _loc_4 = -271733879 + _loc_5 = -1732584194 + _loc_6 = 271733878 + for _loc_7 in range(0, len(param1), 16): + _loc_8 = _loc_3 + _loc_9 = _loc_4 + _loc_10 = _loc_5 + _loc_11 = _loc_6 + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) + _loc_4 = 
cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) + _loc_3 = cls.safe_add(_loc_3, _loc_8) + _loc_4 = cls.safe_add(_loc_4, _loc_9) + _loc_5 = cls.safe_add(_loc_5, _loc_10) + _loc_6 = cls.safe_add(_loc_6, _loc_11) + return [_loc_3, _loc_4, _loc_5, _loc_6] + + @classmethod + def md5_cmn(cls, param1, param2, param3, param4, param5, param6): + return cls.safe_add( + cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) + + @classmethod + def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) + + @classmethod + def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) + + @classmethod + def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, 
param5, param6, param7) + + @classmethod + def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) + + @classmethod + def safe_add(cls, param1, param2): + _loc_3 = (param1 & 65535) + (param2 & 65535) + _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) + return cls.lshift(_loc_4, 16) | _loc_3 & 65535 + + @classmethod + def bit_rol(cls, param1, param2): + return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) + + @staticmethod + def lshift(value, count): + r = (0xFFFFFFFF & value) << count + return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + + video = self._download_json( + self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + + title = video['title'] + duration = float_or_none(video['duration'], 1000) + like_count = video['likes'] + uploader = video['channel'] + uploader_id = video['channel_id'] + + formats = [] + + for resource in video['resources']: + resource_id = resource.get('_id') + if not resource_id: + continue + + security = self._download_json( + self._SECURITY_URL_TEMPLATE % (video_id, resource_id), + video_id, 'Downloading security hash for %s' % resource_id) + + security_hash = security.get('hash') + if not security_hash: + message = security.get('message') + if message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, message), expected=True) + continue + + hash_code = security_hash[:2] + received_time = int(security_hash[2:12]) + received_random = security_hash[12:22] + received_md5 = security_hash[22:] + + sign_time = received_time + self._RESIGN_EXPIRATION + padding = '%010d' % random.randint(1, 10000000000) + + signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) + signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + + formats.append({ + 'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'), + 'format_id': resource_id, + 'height': resource['height'] + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'formats': formats + } \ No newline at end of file diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 73bd6d890..363dc6608 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor): 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), video_id, 'Downloading player config XML') - video_url = config.find('.//file').text - uploader = config.find('.//author').text - timestamp = parse_iso8601(config.find('.//date').text) - duration = parse_duration(config.find('.//duration').text) - thumbnail = config.find('.//image').text + video_url = config.find('file').text + uploader = config.find('author').text + timestamp = parse_iso8601(config.find('date').text) + duration = parse_duration(config.find('duration').text) + thumbnail = config.find('image').text media = self._download_xml( 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') - title = media.find('.//title').text + title = media.find('title').text return { 'id': video_id, 
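The godtube.py hunk above drops the `.//` prefix from the config lookups: in ElementTree, `find('tag')` inspects only the direct children of an element, while `find('.//tag')` searches all descendants and returns the first match in document order, which can pick up an unintended nested node. A standalone sketch of the difference (the XML here is made up for illustration):

```python
import xml.etree.ElementTree as ET

config = ET.fromstring(
    '<config>'
    '<file>http://example.com/video.mp4</file>'
    '<related><file>http://example.com/other.mp4</file></related>'
    '</config>')

# Direct-child lookup: only immediate children of <config> are considered.
print(config.find('file').text)     # http://example.com/video.mp4

# Descendant lookup: the first <file> at any depth wins, so a nested
# element could shadow the intended one if it appeared earlier.
print(config.find('.//file').text)  # http://example.com/video.mp4
```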
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py new file mode 100644 index 000000000..53714f47f --- /dev/null +++ b/youtube_dl/extractor/golem.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + determine_ext, +) + + +class GolemIE(InfoExtractor): + _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/' + _TEST = { + 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', + 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', + 'info_dict': { + 'id': '14095', + 'format_id': 'high', + 'ext': 'mp4', + 'title': 'iPhone 6 und 6 Plus - Test', + 'duration': 300.44, + 'filesize': 65309548, + } + } + + _PREFIX = 'http://video.golem.de' + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_xml( + 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) + + info = { + 'id': video_id, + 'title': config.findtext('./title', 'golem'), + 'duration': self._float(config.findtext('./playtime'), 'duration'), + } + + formats = [] + for e in config: + url = e.findtext('./url') + if not url: + continue + + formats.append({ + 'format_id': e.tag, + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'height': self._int(e.get('height'), 'height'), + 'width': self._int(e.get('width'), 'width'), + 'filesize': self._int(e.findtext('filesize'), 'filesize'), + 'ext': determine_ext(e.findtext('./filename')), + }) + self._sort_formats(formats) + info['formats'] = formats + + thumbnails = [] + for e in config.findall('.//teaser'): + url = e.findtext('./url') + if not url: + continue + thumbnails.append({ + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'width': self._int(e.get('width'), 'thumbnail width'), + 'height': self._int(e.get('height'), 'thumbnail height'), + }) + info['thumbnails'] = thumbnails + + return info diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index ca5f7c417..45cca1d24 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, determine_ext, compat_urllib_parse, compat_urllib_request, @@ -12,20 +13,22 @@ from ..utils import ( class GorillaVidIE(InfoExtractor): - IE_DESC = 'GorillaVid.in and daclips.in' + IE_DESC = 'GorillaVid.in, daclips.in and movpod.in' _VALID_URL = r'''(?x) https?://(?P<host>(?:www\.)? - (?:daclips\.in|gorillavid\.in))/ + (?:daclips\.in|gorillavid\.in|movpod\.in))/ (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? 
''' + _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' + _TESTS = [{ 'url': 'http://gorillavid.in/06y9juieqpmi', 'md5': '5ae4a3580620380619678ee4875893ba', 'info_dict': { 'id': '06y9juieqpmi', 'ext': 'flv', - 'title': 'Rebecca Black My Moment Official Music Video Reaction', + 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor): 'title': 'Micro Pig piglets ready on 16th July 2009', 'thumbnail': 're:http://.*\.jpg', }, + }, { + 'url': 'http://movpod.in/0wguyyxi1yca', + 'only_matching': True, }] def _real_extract(self, url): @@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) + if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = dict(re.findall(r'''(?x)<input\s+ type="hidden"\s+ name="([^"]+)"\s+ @@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage(req, video_id, 'Downloading video page') - title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') - thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') - url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') + title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title') + video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url') + thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False) formats = [{ 'format_id': 'sd', - 'url': url, - 'ext': determine_ext(url), + 'url': video_url, + 'ext': determine_ext(video_url), 'quality': 1, }] diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py new file mode 100644 index 000000000..f97b1e085 --- /dev/null +++ b/youtube_dl/extractor/heise.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + get_meta_content, + parse_iso8601, +) + + +class HeiseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?heise\.de/video/artikel/ + .+?(?P<id>[0-9]+)\.html(?:$|[?#]) + ''' + _TEST = { + 'url': ( + 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' + ), + 'md5': 'ffed432483e922e88545ad9f2f15d30e', + 'info_dict': { + 'id': '2404147', + 'ext': 'mp4', + 'title': ( + "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" + ), + 'format_id': 'mp4_720', + 'timestamp': 1411812600, + 'upload_date': '20140927', + 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'json_url:\s*"([^"]+)"', webpage, 'json URL') + config = self._download_json(json_url, video_id) + + info = { + 'id': video_id, + 'thumbnail': config.get('poster'), + 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'description': self._og_search_description(webpage), + } + + title = get_meta_content('fulltitle', webpage) + if title: + info['title'] = title + elif config.get('title'): + info['title'] = config['title'] + 
else: + info['title'] = self._og_search_title(webpage) + + formats = [] + for t, rs in config['formats'].items(): + if not rs or not hasattr(rs, 'items'): + self._downloader.report_warning( + 'formats: {0}: no resolutions'.format(t)) + continue + + for height_str, obj in rs.items(): + format_id = '{0}_{1}'.format(t, height_str) + + if not obj or not obj.get('url'): + self._downloader.report_warning( + 'formats: {0}: no url'.format(format_id)) + continue + + formats.append({ + 'url': obj['url'], + 'format_id': format_id, + 'height': self._int(height_str, 'height'), + }) + + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 12e9e61c4..c80185b53 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,7 +89,12 @@ class IGNIE(InfoExtractor): '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: - return [self.url_result(u, ie='IGN') for u in multiple_urls] + entries = [self.url_result(u, ie='IGN') for u in multiple_urls] + return { + '_type': 'playlist', + 'id': name_or_id, + 'entries': entries, + } video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 4ddda2f1b..53f9a5f75 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor): _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' _TEST = { - u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', - u'file': u'452693.mp4', - u'info_dict': { - u'title': u'SKYFALL', - u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - u'duration': 153, + 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'info_dict': { + 'id': '452693', + 'ext': 'mp4', + 'title': 'SKYFALL', + 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. 
As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + 'duration': 149, }, } @@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor): url = self._build_url(query) flashconfiguration = self._download_xml(url, video_id, - u'Downloading flash configuration') + 'Downloading flash configuration') file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor): lambda m: self._clean_query(m.group()), file_url) info = self._download_xml(file_url, video_id, - u'Downloading video info') + 'Downloading video info') item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index a83dd249f..07ef682ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + thumbnail = self._proto_relative_url( + self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( r"adduserUsername\s*=\s*'([^']+)';", diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py index aad782578..122e2dd8c 100644 --- a/youtube_dl/extractor/jpopsukitv.py +++ b/youtube_dl/extractor/jpopsukitv.py @@ -1,8 +1,6 @@ # coding=utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -12,14 +10,14 @@ from ..utils import ( class JpopsukiIE(InfoExtractor): IE_NAME = 'jpopsuki.tv' - _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' + _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P<id>\S+)' _TEST = { 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', 'md5': '88018c0c1a9b1387940e90ec9e7e198e', - 'file': '00be659d23b0b40508169cdee4545771.mp4', 'info_dict': { 'id': '00be659d23b0b40508169cdee4545771', + 'ext': 'mp4', 'title': 'ayumi hamasaki - evolution', 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', @@ -30,8 +28,7 @@ class JpopsukiIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -47,11 +44,9 @@ class JpopsukiIE(InfoExtractor): uploader_id = self._html_search_regex( r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)', webpage, 'video uploader_id', fatal=False) - upload_date = self._html_search_regex( + upload_date = unified_strdate(self._html_search_regex( r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', - fatal=False) - if upload_date is not None: - upload_date = unified_strdate(upload_date) + fatal=False)) view_count_str = self._html_search_regex( r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count', fatal=False) diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index 9b553b9fa..5aa32bf09 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -11,10 +11,9 @@ from ..utils import ( class JukeboxIE(InfoExtractor): - _VALID_URL = 
r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html' _TEST = { 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'md5': '1574e9b4d6438446d5b7dbcdf2786276', 'info_dict': { 'id': 'r303r', 'ext': 'flv', @@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) html = self._download_webpage(url, video_id) iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py new file mode 100644 index 000000000..fca0bfef0 --- /dev/null +++ b/youtube_dl/extractor/lrt.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + parse_duration, + remove_end, +) + + +class LRTIE(InfoExtractor): + IE_NAME = 'lrt.lt' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', + 'info_dict': { + 'id': '54391', + 'ext': 'mp4', + 'title': 'Septynios Kauno dienos', + 'description': 'Kauno miesto ir apskrities naujienos', + 'duration': 1783, + }, + 'params': { + 'skip_download': True, # HLS download + }, + + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - LRT') + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + duration = parse_duration(self._search_regex( + r"'duration':\s*'([^']+)',", webpage, + 'duration', fatal=False, default=None)) + + formats = [] + for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): + data = json.loads(js_to_json(js)) + if data['provider'] == 'rtmp': + formats.append({ + 'format_id': 'rtmp', + 'ext': determine_ext(data['file']), + 'url': data['streamer'], + 'play_path': 'mp4:%s' % data['file'], + 'preference': -1, + }) + else: + formats.extend( + self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + } diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 3a938861b..c7f6beb9c 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json from .common import InfoExtractor @@ -23,6 +22,7 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, @@ -33,9 +33,7 @@ class MuenchenTVIE(InfoExtractor): display_id = 'live' webpage = self._download_webpage(url, display_id) - now = datetime.datetime.now() - now_str = now.strftime("%Y-%m-%d %H:%M") - title = self._og_search_title(webpage) + ' ' + now_str + title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( r'(?s)\nplaylist:\s*(\[.*?}\]),related:', @@ -73,5 +71,6 @@ class MuenchenTVIE(InfoExtractor): 'title': title, 'formats': formats, 'is_live': True, + 
'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 963c4587c..cc7c921c3 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + compat_urllib_parse_urlparse, int_or_none, remove_end, ) @@ -13,76 +14,116 @@ from ..utils import ( class NFLIE(InfoExtractor): IE_NAME = 'nfl.com' - _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' - _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' - _TEST = { - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates - 'info_dict': { - 'id': '0ap3000000398478', - 'ext': 'mp4', - 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, - 'thumbnail': 're:^https?://.*\.jpg$', + _VALID_URL = r'''(?x)https?:// + (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ + (?:.+?/)* + (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + _TESTS = [ + { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Redskins vs. Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'info_dict': { + 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'ext': 'mp4', + 'title': 'LIVE: Post Game vs. 
Browns', + 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', + 'upload_date': '20131229', + 'timestamp': 1388354455, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + ] + + @staticmethod + def prepend_host(host, url): + if not url.startswith('http'): + if not url.startswith('/'): + url = '/%s' % url + url = 'http://{0:}{1:}'.format(host, url) + return url + + @staticmethod + def format_from_stream(stream, protocol, host, path_prefix='', + preference=0, note=None): + url = '{protocol:}://{host:}/{prefix:}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=stream.get('path'), + ) + return { + 'url': url, + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': note, } - } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, host = mobj.group('id'), mobj.group('host') - config = self._download_json(self._PLAYER_CONFIG_URL, video_id, - note='Downloading player config') - url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) - video_data = self._download_json(url_template.format(id=video_id), video_id) + webpage = self._download_webpage(url, video_id) - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) + config_url = NFLIE.prepend_host(host, self._search_regex( + r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL')) + config = self._download_json(config_url, video_id, + note='Downloading player config') + url_template = NFLIE.prepend_host( + host, '{contentURLTemplate:}'.format(**config)) + video_data = self._download_json( + url_template.format(id=video_id), video_id) formats = [] - streams = video_data.get('cdnData', {}).get('bitrateInfo', []) - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - path_prefix = cdn.get('pathprefix', '') - if path_prefix and not path_prefix.endswith('/'): - path_prefix = '%s/' % path_prefix - - get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=p, - ) - - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = -1 - else: - preference = 0 - + cdn_data = video_data.get('cdnData', {}) + streams = cdn_data.get('bitrateInfo', []) + if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': + parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) + protocol, host = parts.scheme, parts.netloc for stream in streams: - path = stream.get('path') - if not path: + formats.append( + NFLIE.format_from_stream(stream, protocol, host)) + else: + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': continue - formats.append({ - 'url': get_url(path), - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': name, - }) + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + prefix = cdn.get('pathprefix', '') + if prefix and not prefix.endswith('/'): + prefix = '%s/' % prefix + + preference = 0 + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = 1 + + for stream in 
streams: + formats.append( + NFLIE.format_from_stream(stream, protocol, host, + prefix, preference, name)) self._sort_formats(formats) @@ -94,7 +135,7 @@ class NFLIE(InfoExtractor): return { 'id': video_id, - 'title': video_data.get('storyHeadline'), + 'title': video_data.get('headline'), 'formats': formats, 'description': video_data.get('caption'), 'duration': video_data.get('duration'), diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py new file mode 100644 index 000000000..4a41c0542 --- /dev/null +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OktoberfestTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' + + _TEST = { + 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', + 'info_dict': { + 'id': 'hb-zelt', + 'ext': 'mp4', + 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.jpg$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._live_title(self._html_search_regex( + r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + + clip = self._search_regex( + r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') + ncurl = self._search_regex( + r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') + video_url = ncurl + clip + thumbnail = self._search_regex( + r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, + 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'is_live': True, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 2adfde909..8f140d626 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + unified_strdate, US_RATINGS, ) @@ -11,10 +12,10 @@ from ..utils import ( class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: - # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | - # Article with embedded player - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | + # Direct video URL + video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
| + # Article with embedded player (or direct video) + (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -65,10 +66,25 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', + 'md5': '908f3e5473a693b266b84e25e1cf9703', + 'info_dict': { + 'id': '2365160389', + 'display_id': 'killer-typhoon', + 'ext': 'mp4', + 'description': 'md5:c741d14e979fc53228c575894094f157', + 'title': 'Killer Typhoon', + 'duration': 3172, + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140122', + } } + ] - def _extract_ids(self, url): + def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) presumptive_id = mobj.group('presumptive_id') @@ -76,15 +92,20 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) + upload_date = unified_strdate(self._search_regex( + r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', + webpage, 'upload date', default=None)) + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] media_id = self._search_regex( MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: - return media_id, presumptive_id + return media_id, presumptive_id, upload_date url = self._search_regex( r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', @@ -104,10 +125,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id + return video_id, display_id, None def _real_extract(self, url): - video_id, display_id = self._extract_ids(url) + video_id, display_id, upload_date = self._extract_webpage(url) info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) @@ -119,6 +140,7 @@ class PBSIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'title': info['title'], 'url': info['alternate_encoding']['url'], 'ext': 'mp4', @@ -126,4 +148,5 @@ class PBSIE(InfoExtractor): 'thumbnail': info.get('image_url'), 'duration': info.get('duration'), 'age_limit': age_limit, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py new file mode 100644 index 000000000..596c621d7 --- /dev/null +++ b/youtube_dl/extractor/planetaplay.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PlanetaPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)' + _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' + _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' + _TEST = { + 'url': 'http://planetaplay.com/?sng=3586', + 'md5': '9d569dceb7251a4e01355d5aea60f9db', + 'info_dict': { + 'id': '3586', + 'ext': 'flv', + 'title': 'md5:e829428ee28b1deed00de90de49d1da1', + } + } + + _SONG_FORMATS = { + 'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), + 'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + response = 
self._download_json( + self._API_URL.format(video_id), video_id)['response'] + try: + data = response.get('data')[0] + except IndexError: + raise ExtractorError( + '%s: failed to get the playlist' % self.IE_NAME, expected=True) + + title = '{song_artists:} - {sng_name:}'.format(**data) + thumbnail = self._THUMBNAIL_URL.format(**data) + + formats = [] + for format_id, (quality, url_template) in self._SONG_FORMATS.items(): + formats.append({ + 'format_id': format_id, + 'url': url_template.format(**data), + 'quality': quality, + 'ext': 'flv', + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py new file mode 100644 index 000000000..645a1e06d --- /dev/null +++ b/youtube_dl/extractor/played.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import os.path + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + + +class PlayedIE(InfoExtractor): + IE_NAME = 'played.to' + _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' + + _TEST = { + 'url': 'http://played.to/j2f2sfiiukgt', + 'md5': 'c2bd75a368e82980e7257bf500c00637', + 'info_dict': { + 'id': 'j2f2sfiiukgt', + 'ext': 'flv', + 'title': 'youtube-dl_test_video.mp4', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + orig_webpage = self._download_webpage(url, video_id) + fields = re.findall( + r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) + data = dict(fields) + + self._sleep(2, video_id) + + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') + + title = os.path.splitext(data['fname'])[0] + + video_url = self._search_regex( + r'file: "?(.+?)",', webpage, 'video URL') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 5b2a723c1..619496de7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor): 'id': '2156342', 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', - 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528', + 'description': 'Romantischer Kurztrip zum Valentinstag? 
Wir verraten, was sich hier wirklich lohnt.', 'duration': 307.24, }, 'params': { @@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - page = self._download_webpage(url, video_id, 'Downloading page') - - clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id') + clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'testclient' client_name = 'kolibri-1.2.5' @@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor): urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, page, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(page) + title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') + description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None)) + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) formats = [] diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 2007a0013..94602e89e 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -9,7 +9,6 @@ from ..utils import ( compat_urllib_parse, unified_strdate, str_to_int, - int_or_none, ) from ..aes import aes_decrypt_text @@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') + title = self._html_search_regex( + r'<h1>([^<]+)', webpage, 'title') description = self._html_search_regex( - r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False) + r'<div\s+id="descriptionContent">([^<]+)<', + webpage, 'description', fatal=False) thumbnail = self._html_search_regex( - r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) + r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', + webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( - r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) + r'by:\s*<a [^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False) - upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) - - view_count = self._html_search_regex( - r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) - comment_count = int_or_none(self._html_search_regex( - r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False)) + r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', + webpage, 'uploader id', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'</a> on (.+?) 
at \d+:\d+', + webpage, 'upload date', fatal=False)) - video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) + view_count = str_to_int(self._html_search_regex( + r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>', + webpage, 'comment count', fatal=False)) + + video_urls = list(map( + compat_urllib_parse.unquote, + re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + password = self._html_search_regex( + r'flashvars\.video_title = "([^"]+)', + webpage, 'password').replace('+', ' ') + video_urls = list(map( + lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), + video_urls)) formats = [] for video_url in video_urls: diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 185353bef..abb827783 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor): 'info_dict': { 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'ext': 'mp4', - 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', + 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE', + 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 7de3c9dd5..263f09b46 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False)) + r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False)) + r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py new file mode 100644 index 000000000..77e056242 --- /dev/null +++ b/youtube_dl/extractor/tapely.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + compat_urllib_request, + float_or_none, + parse_iso8601, +) + + +class TapelyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' 
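+ # A trailing /<number> (the songnr group) selects a single song; without it the whole tape is returned as a playlist.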
+ _API_URL = 'http://tape.ly/showtape?id={0:}' + _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' + _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' + _TESTS = [ + { + 'url': 'http://tape.ly/my-grief-as-told-by-water', + 'info_dict': { + 'id': 23952, + 'title': 'my grief as told by water', + 'thumbnail': 're:^https?://.*\.png$', + 'uploader_id': 16484, + 'timestamp': 1411848286, + 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', + }, + 'playlist_count': 13, + }, + { + 'url': 'http://tape.ly/my-grief-as-told-by-water/1', + 'md5': '79031f459fdec6530663b854cbc5715c', + 'info_dict': { + 'id': 258464, + 'title': 'Dreaming Awake (My Brightest Diamond)', + 'ext': 'm4a', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + playlist_url = self._API_URL.format(display_id) + request = compat_urllib_request.Request(playlist_url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + request.add_header('Accept', 'application/json') + + playlist = self._download_json(request, display_id) + + tape = playlist['tape'] + + entries = [] + for s in tape['songs']: + song = s['song'] + entry = { + 'id': song['id'], + 'duration': float_or_none(song.get('songduration'), 1000), + 'title': song['title'], + } + if song['source'] == 'S3': + entry.update({ + 'url': self._S3_SONG_URL.format(song['filename']), + }) + entries.append(entry) + elif song['source'] == 'YT': + self.to_screen('YouTube video detected') + yt_id = song['filename'].replace('/youtube/', '') + entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) + entries.append(entry) + elif song['source'] == 'SC': + self.to_screen('SoundCloud song detected') + sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) + entry.update(self.url_result(sc_url, 'Soundcloud')) + entries.append(entry) + else: + self.report_warning('Unknown song source: %s' % song['source']) + + if mobj.group('songnr'): + songnr = int(mobj.group('songnr')) - 1 + try: + return entries[songnr] + except IndexError: + raise ExtractorError( + 'No song with index: %s' % mobj.group('songnr'), + expected=True) + + return { + '_type': 'playlist', + 'id': tape['id'], + 'display_id': display_id, + 'title': tape['name'], + 'entries': entries, + 'thumbnail': tape.get('image_url'), + 'description': clean_html(tape.get('subtext')), + 'like_count': tape.get('likescount'), + 'uploader_id': tape.get('user_id'), + 'timestamp': parse_iso8601(tape.get('published_at')), + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1cca47771..d5e28efad 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor): thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'], + 'title': talk_info['title'].strip(), 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py new file mode 100644 index 000000000..a77c6a2fc --- /dev/null +++ b/youtube_dl/extractor/thesixtyone.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class TheSixtyOneIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ + (?:.*?/)* + (?: + s| + song/comments/list| + song + )/(?P<id>[A-Za-z0-9]+)/?$''' 
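+ # URL templates for the song page, the audio stream (whose filename is the song key, decoded via _DECODE_MAP and reversed) and the thumbnail.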
+ _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' + _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream' + _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' + _TESTS = [ + { + 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', + 'md5': '821cc43b0530d3222e3e2b70bb4622ea', + 'info_dict': { + 'id': 'SrE3zD7s1jt', + 'ext': 'mp3', + 'title': 'CASIO - Unicorn War Mixtape', + 'thumbnail': 're:^https?://.*_desktop$', + 'upload_date': '20071217', + 'duration': 3208, + } + }, + { + 'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', + 'only_matching': True, + }, + ] + + _DECODE_MAP = { + "x": "a", + "m": "b", + "w": "c", + "q": "d", + "n": "e", + "p": "f", + "a": "0", + "h": "1", + "e": "2", + "u": "3", + "s": "4", + "i": "5", + "o": "6", + "y": "7", + "r": "8", + "c": "9" + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + song_id = mobj.group('id') + + webpage = self._download_webpage( + self._SONG_URL_TEMPLATE.format(song_id), song_id) + + song_data = json.loads(self._search_regex( + r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')) + keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] + url = self._SONG_FILE_URL_TEMPLATE.format( + "".join(reversed(keys)), **song_data) + + formats = [{ + 'format_id': 'sd', + 'url': url, + 'ext': 'mp3', + }] + + return { + 'id': song_id, + 'title': '{artist:} - {name:}'.format(**song_data), + 'formats': formats, + 'comment_count': song_data.get('comments_count'), + 'duration': song_data.get('play_time'), + 'like_count': song_data.get('score'), + 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), + 'upload_date': unified_strdate(song_data.get('publish_date')), + } diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 607e947bb..496f15d80 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # extract download link from mobile player page webpage_player = self._download_webpage( @@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor): 'description': description, 'upload_date': upload_date } + + +class THVideoPlaylistIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/mylist2', + 'info_dict': { + 'id': '2', + 'title': '幻想万華鏡', + }, + 'playlist_mincount': 23, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + list_title = self._html_search_regex( + r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title', + fatal=False) + + entries = [ + self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') + for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] + + return self.playlist_result(entries, playlist_id, list_title) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index dc8697850..27962b5fe 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -17,16 +17,16 @@ class 
TvigleIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.tvigle.ru/video/brat-2/', - 'md5': '72cb7eab33e54314e1790da402d3c9c3', + 'url': 'http://www.tvigle.ru/video/brat/', + 'md5': 'ff4344a4894b0524441fb6f8218dc716', 'info_dict': { - 'id': '5119390', - 'display_id': 'brat-2', + 'id': '5118490', + 'display_id': 'brat', 'ext': 'mp4', - 'title': 'Брат 2 ', - 'description': 'md5:5751f4fe345a58e1692585c361294bd8', - 'duration': 7356.369, - 'age_limit': 0, + 'title': 'Брат', + 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb', + 'duration': 5722.6, + 'age_limit': 16, }, }, { @@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor): 'format_id': '%s-%s' % (vcodec, quality), 'vcodec': vcodec, 'height': int(quality[:-1]), + 'filesize': item['video_files_size'][vcodec][quality], }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ebab8b86c..5b1a3ec78 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_HTTPError, compat_urllib_request, ExtractorError, ) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 7d27d6c57..964470070 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen', 'info_dict': { 'id': '100764', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', 'thumbnail': 're:^https?://.*\.jpg', @@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen', 'info_dict': { 'id': '100015', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!', 'description': 'md5:9a60cc23fa349f761628924e56eeec2d', 'thumbnail': 're:^https?://.*\.jpg', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bc01d7fbf..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,17 +8,19 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -89,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:380943ec71b89736ff4bf27183233d09', 'duration': 1595, }, }, @@ -103,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, + 'description': None, }, }, { @@ -117,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 
'youtube-dl', @@ -203,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id @@ -273,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description - video_description = None - try: - video_description = get_element_by_attribute("class", "description_wrapper", webpage) - if video_description: - video_description = clean_html(video_description) - except AssertionError as err: - # On some pages like (http://player.vimeo.com/video/54469442) the - # html tags are not closed, python 2.6 cannot handle it - if err.args[0] == 'we should not get here!': - pass - else: - raise + + video_description = self._html_search_regex( + r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', + webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_meta( + 'description', webpage, default=None) + if not video_description and mobj.group('pro'): + orig_webpage = self._download_webpage( + orig_url, video_id, + note='Downloading webpage for description', + fatal=False) + if orig_webpage: + video_description = self._html_search_meta( + 'description', orig_webpage, default=None) + if not video_description and not mobj.group('player'): + self._downloader.report_warning('Cannot find video description') # Extract video duration video_duration = int_or_none(config["video"].get("duration")) @@ -529,3 +540,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): def _real_extract(self, url): return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + + +class VimeoLikesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' + IE_NAME = 'vimeo:likes' + IE_DESC = 'Vimeo user likes' + _TEST = { + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, + "info_dict": { + "description": "See all the videos urza likes", + "title": 'Videos urza likes', + }, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> + .*?</a></li>\s*<li\s+class="pagination_next"> + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', + webpage, 'video content') + paths = re.findall( + r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) + for path in paths: + yield { + '_type': 'url', + 'url': compat_urlparse.urljoin(page_url, path), + } + + pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': 'user%s_likes' % user_id, + 'title': title, + 'description': description, + 'entries': pl, + } diff --git 
a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index fb0600f1a..ec3c010ad 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + ExtractorError, parse_duration, qualities, ) @@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434', - 'md5': '92ac9d1ccefec4f0bb474661ab144fcf', + 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', 'info_dict': { - 'id': '843902317', + 'id': '922692425', 'ext': '3gp', - 'title': 'Movie Trailer: Noah', - 'duration': 139, + 'title': 'The Toy Soldiers - Hollywood Movie Trailer', + 'duration': 180, } } @@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor): webpage = self._download_webpage( adfree_url, video_id, note='Download post-ad page') + error_msg = self._html_search_regex( + r'<p class="message">(.*?)</p>', webpage, 'error message', + default=None) + if error_msg: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_msg), expected=True) + + # These clowns alternate between two page types links_code = self._search_regex( - r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage, - 'links') + r'''(?xs) + (?: + <img\s+src="/im/play.gif".*?>| + <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> + ) + (.*?) + (?: + <a\s+href="fblike|<div\s+class="social"> + ) + ''', webpage, 'links') title = self._html_search_regex( r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip() quality_order = qualities(['Reg', 'Hi']) formats = [] for url, q in re.findall( - r'<a href="([^"]+)">(?P<q>[^<]+)</a>', links_code): + r'<a href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q formats.append({ 'format_id': format_id, @@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor): self._sort_formats(formats) duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) + r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 4e89acd81..bda3870db 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor): "info_dict": { "id": "wshh6a7q1ny0G34ZwuIO", "ext": "mp4", - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" + "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage_src = self._download_webpage(url, video_id) - - m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) + m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) if m_vevo_id is not None: return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') video_url = self._search_regex( - r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') + r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL') if 'youtube' in video_url: return self.url_result(video_url, ie='Youtube') video_title = self._html_search_regex( - r"<title>(.*)</title>", webpage_src, 'title') + r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>
', + webpage, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', + r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', fatal=False) if not thumbnail: - _title = r"""candytitles.*>(.*)""" - mobj = re.search(_title, webpage_src) + _title = r'candytitles.*>(.*)' + mobj = re.search(_title, webpage) if mobj is not None: video_title = mobj.group(1) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 3ab6017cd..221341c13 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -37,16 +37,6 @@ class YahooIE(InfoExtractor): 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', }, }, - { - 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', - 'md5': '410b7104aa9893b765bc22787a22f3d9', - 'info_dict': { - 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', - 'ext': 'mp4', - 'title': 'The World Loves Spider-Man', - 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', - } - }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', 'md5': '60e8ac193d8fb71997caa8fce54c6460', diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 24872861a..944d7da38 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,7 @@ class YnetIE(InfoExtractor): _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'md5': '4b29cb57c3dddd57642b3f051f535b07', 'info_dict': { 'id': 'L-11659-99244', 'ext': 'flv', @@ -22,7 +22,7 @@ class YnetIE(InfoExtractor): } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', - 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'md5': '8194c2ea221e9a639cac96b6b0753dc5', 'info_dict': { 'id': 'L-8859-84418', 'ext': 'flv', @@ -33,9 +33,7 @@ class YnetIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 07ed7cbd1..48d47a245 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import json import math import random import re diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..9041cfa87 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, int_or_none, - PagedList, + OnDemandPagedList, unescapeHTML, unified_strdate, orderedSet, @@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + pref_cookies = [ + c for c in self._downloader.cookiejar + if c.domain == '.youtube.com' and c.name == 'PREF'] 
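+ # Force hl=en in the PREF cookie so the English site is served and layout-dependent regexes keep matching.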
+ for pc in pref_cookies: + if 'hl=' in pc.value: + pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) + else: + if pc.value: + pc.value += '&' + pc.value += 'hl=en' video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor): 'id': video_id, 'title': title, } - url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) + url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 44dcb1e34..f651337ad 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None): for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) - opts[i+1] = '' + opts[i+1] = 'PRIVATE' except ValueError: pass return opts diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..d7ae5a90a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -673,6 +673,8 @@ class ExtractorError(Exception): expected = True if video_id is not None: msg = video_id + ': ' + msg + if cause: + msg += u' (caused by %r)' % cause if not expected: msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) @@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['User-agent'] req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): @@ -884,7 +892,9 @@ def unified_strdate(date_str): '%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1384,14 +1394,16 @@ def check_executable(exe, args=[]): class PagedList(object): - def __init__(self, pagefunc, pagesize): - self._pagefunc = pagefunc - self._pagesize = pagesize - def __len__(self): # This is only useful for tests return len(self.getslice()) + +class OnDemandPagedList(PagedList): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1442,35 @@ class PagedList(object): return res +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagefunc = pagefunc + self._pagecount = pagecount + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page = list(self._pagefunc(pagenum)) + if skip_elems: + page = page[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page) < only_more: + only_more -= len(page) + else: + page = 
page[:only_more] + res.extend(page) + break + res.extend(page) + return res + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( @@ -1534,33 +1575,37 @@ US_RATINGS = { } +def parse_age_limit(s): + if s is None: + return None + m = re.match(r'^(?P<age>\d{1,2})\+?$', s) + return int(m.group('age')) if m else US_RATINGS.get(s, None) + + def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) def js_to_json(code): def fix_kv(m): - key = m.group(2) - if key.startswith("'"): - assert key.endswith("'") - assert '"' not in key - key = '"%s"' % key[1:-1] - elif not key.startswith('"'): - key = '"%s"' % key - - value = m.group(4) - if value.startswith("'"): - assert value.endswith("'") - assert '"' not in value - value = '"%s"' % value[1:-1] - - return m.group(1) + key + m.group(3) + value + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + if v.startswith('"'): + return v + if v.startswith("'"): + v = v[1:-1] + v = re.sub(r"\\\\|\\'|\"", lambda m: { + '\\\\': '\\\\', + "\\'": "'", + '"': '\\"', + }[m.group(0)], v) + return '"%s"' % v res = re.sub(r'''(?x) - ([{,]\s*) - ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) - (:\s*) - ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) + "(?:[^"\\]*(?:\\\\|\\")?)*"| + '(?:[^'\\]*(?:\\\\|\\')?)*'| + [a-zA-Z_][a-zA-Z_0-9]* ''', fix_kv, code) res = re.sub(r',(\s*\])', lambda m: m.group(1), res) return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c17701d6a..4f0d486b9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.25' +__version__ = '2014.10.05.2'
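
A note on the `js_to_json` rewrite above: instead of matching whole key/value pairs, the new version tokenizes string literals and bare identifiers and normalizes each token to strict JSON. A minimal illustration of the intended behaviour (an editor's sketch, not part of the patch, using the helper from `youtube_dl.utils`):

```
import json

from youtube_dl.utils import js_to_json

# Bare keys and single-quoted strings are rewritten to double-quoted JSON;
# true/false/null and double-quoted strings pass through unchanged.
print(json.loads(js_to_json("{provider: 'rtmp', file: 'video.mp4', live: false}")))
# -> {'provider': 'rtmp', 'file': 'video.mp4', 'live': False}
```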