From: Sergey M․ Date: Wed, 16 Jul 2014 12:13:35 +0000 (+0700) Subject: Merge branch 'MLB' of https://github.com/chaochichen/youtube-dl into chaochichen-MLB X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=1aa42fedee7df58ccd4e39b04a2a0438c5630f03;hp=172240c0a40f44d2aa384c512cc65c7e4c9e3660;p=youtube-dl Merge branch 'MLB' of https://github.com/chaochichen/youtube-dl into chaochichen-MLB --- diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 8d46fe108..d95533959 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -33,6 +33,12 @@ _TESTS = [ 90, u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), + ( + u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', + u'js', + 84, + u'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', + ), ( u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', u'js', diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 89a2cb3e8..5e16a5491 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -63,6 +63,7 @@ __authors__ = ( 'Ariset Llerena', 'Adam Malcontenti-Wilson', 'Tobias Bell', + 'Naglis Jonaitis', ) __license__ = 'Public Domain' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f75939a05..14133c315 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -232,6 +233,7 @@ from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE +from .reverbnation import ReverbNationIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE @@ -240,6 +242,7 @@ from .rtbf import RTBFIE from .rtlnow import RTLnowIE from .rts import RTSIE from .rtve import RTVEALaCartaIE +from .ruhd import RUHDIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -268,8 +271,8 @@ from .soundcloud import ( SoundcloudPlaylistIE ) from .soundgasm import SoundgasmIE -from .southparkstudios import ( - SouthParkStudiosIE, +from .southpark import ( + SouthParkIE, SouthparkDeIE, ) from .space import SpaceIE diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py new file mode 100644 index 000000000..d26145db1 --- /dev/null +++ b/youtube_dl/extractor/firedrive.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + determine_ext, +) + + +class FiredriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ + '(?:file|embed)/(?P[0-9a-zA-Z]+)' + _FILE_DELETED_REGEX = r'
' + + _TESTS = [{ + 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', + 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', + 'info_dict': { + 'id': 'FEB892FA160EBD01', + 'ext': 'flv', + 'title': 'bbb_theora_486kbit.flv', + 'thumbnail': 're:^http://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://firedrive.com/file/%s' % video_id + + webpage = self._download_webpage(url, video_id) + + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, + expected=True) + + fields = dict(re.findall(r'''(?x)(.+)
', + webpage, 'title') + thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, + 'thumbnail', fatal=False) + if thumbnail is not None: + thumbnail = 'http:' + thumbnail + + ext = self._search_regex(r'type:\s?\'([^\']+)\',', + webpage, 'extension', fatal=False) + video_url = self._search_regex( + r'file:\s?\'(http[^\']+)\',', webpage, 'file url') + + formats = [{ + 'format_id': 'sd', + 'url': video_url, + 'ext': ext, + }] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index af9490ccc..228b42d2b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -158,6 +158,9 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid.endswith('.swf'): mgid = mgid[:-4] except RegexNotFoundError: + mgid = None + + if mgid is None or ':' not in mgid: mgid = self._search_regex( [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], webpage, u'mgid') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 3d6096e46..94d5ba982 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -18,15 +18,15 @@ class NDRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html', - 'md5': 'e7a6079ca39d3568f4996cb858dd6708', + 'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html', + 'md5': '4a4eeafd17c3058b65f0c8f091355855', 'note': 'Video file', 'info_dict': { - 'id': '7959', + 'id': '325', 'ext': 'mp4', - 'title': 'Markt - die ganze Sendung', - 'description': 'md5:af9179cf07f67c5c12dc6d9997e05725', - 'duration': 2655, + 'title': 'Blaue Bohnen aus Blocken', + 'description': 'md5:190d71ba2ccddc805ed01547718963bc', + 'duration': 1715, }, }, { diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 0bc0859b4..6d5732d45 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -46,7 +46,7 @@ class PyvideoIE(InfoExtractor): return self.url_result(m_youtube.group(1), 'Youtube') title = self._html_search_regex( - r'
.*?([^>]+?)', + r'
\s*]*)?>([^>]+?)', webpage, 'title', flags=re.DOTALL) video_url = self._search_regex( [r'Download.*?\d+).*?$' + _TESTS = [{ + 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', + 'file': '16965047.mp3', + 'md5': '3da12ebca28c67c111a7f8b262d3f7a7', + 'info_dict': { + "title": "MONA LISA", + "uploader": "ALKILADOS", + "uploader_id": 216429, + "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg" + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + song_id = mobj.group('id') + + api_res = self._download_json( + 'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d' + % (song_id, int(time.time() * 1000)), + song_id, + transform_source=strip_jsonp, + note='Downloading information of song %s' % song_id + ) + + return { + 'id': song_id, + 'title': api_res.get('name'), + 'url': api_res.get('url'), + 'uploader': api_res.get('artist', {}).get('name'), + 'uploader_id': api_res.get('artist', {}).get('id'), + 'thumbnail': api_res.get('image', api_res.get('thumbnail')), + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py new file mode 100644 index 000000000..55b58e5e6 --- /dev/null +++ b/youtube_dl/extractor/ruhd.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RUHDIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P\d+)' + _TEST = { + 'url': 'http://www.ruhd.ru/play.php?vid=207', + 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', + 'info_dict': { + 'id': '207', + 'ext': 'divx', + 'title': 'КОТ бааааам', + 'description': 'классный кот)', + 'thumbnail': 're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'([^<]+)   RUHD.ru - Видео Высокого качества №1 в России!', webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)', webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'southpark\.cc\.com/(clips|full-episodes)/(?P.+?)(\?|#|$))' + + _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'info_dict': { + 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'South Park|Bat Daded', + 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + }, + }] + + +class SouthparkDeIE(SouthParkIE): + IE_NAME = 'southpark.de' + _VALID_URL = r'https?://(www\.)?(?Psouthpark\.de/(clips|alle-episoden)/(?P.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', + 'info_dict': { + 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', + 'ext': 'mp4', + 'title': 'The Government Won\'t Respect My Privacy', + 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + }, + }] diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py deleted file mode 100644 index aea8e6439..000000000 --- a/youtube_dl/extractor/southparkstudios.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor - - -class SouthParkStudiosIE(MTVServicesInfoExtractor): - IE_NAME = 'southparkstudios.com' - _VALID_URL = r'https?://(www\.)?(?Psouthparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$))' - - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' - - _TESTS = [{ - 'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', - 'info_dict': { - 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Bat Daded', - 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', - }, - }] - - -class SouthparkDeIE(SouthParkStudiosIE): - IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(www\.)?(?Psouthpark\.de/(clips|alle-episoden)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' - - _TESTS = [{ - 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', - 'info_dict': { - 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', - 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', - 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', - }, - }] diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index ad175b83e..d848ee186 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveIE from .discovery import DiscoveryIE +from ..utils import compat_urlparse class TlcIE(DiscoveryIE): @@ -51,6 +52,10 @@ class TlcDeIE(InfoExtractor): # Otherwise we don't get the correct 'BrightcoveExperience' element, # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ iframe_url = iframe_url.replace('.htm?', '.php?') + url_fragment = compat_urlparse.urlparse(url).fragment + if url_fragment: + # Since the fragment is not send to the server, we always get the same iframe + iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) iframe = self._download_webpage(iframe_url, title) return { diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index c980153ec..d516b6427 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,21 +1,21 @@ from __future__ import unicode_literals + import base64 import re from .common import InfoExtractor -from ..utils import ( - compat_parse_qs, -) +from ..utils import compat_parse_qs class TutvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P[^/?]+)' _TEST = { - 'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', - 'file': '2742556.flv', - 'md5': '5eb766671f69b82e528dc1e7769c5cb2', + 'url': 'http://tu.tv/videos/robots-futbolistas', + 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7', 'info_dict': { - 'title': 'Noah en pabellon cuahutemoc', + 'id': '2973058', + 'ext': 'flv', + 'title': 'Robots futbolistas', }, } @@ -26,10 +26,9 @@ class TutvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') - data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) - data_content = self._download_webpage(data_url, video_id, note='Downloading video info') - data = compat_parse_qs(data_content) - video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') + data_content = self._download_webpage( + 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') + video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') return { 'id': internal_id, diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 3bbb07704..ae5bca2e6 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -11,6 +11,7 @@ class JSInterpreter(object): def __init__(self, code): self.code = code self._functions = {} + self._objects = {} def interpret_statement(self, stmt, local_vars, allow_recursion=20): if allow_recursion < 0: @@ -55,7 +56,19 @@ class JSInterpreter(object): m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) if m: member = m.group('member') - val = local_vars[m.group('in')] + variable = m.group('in') + + if variable not in local_vars: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + key, args = member.split('(', 1) + args = args.strip(')') + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in args.split(',')] + return obj[key](argvals) + + val = local_vars[variable] if member == 'split("")': return list(val) if member == 'join("")': @@ -97,6 +110,25 @@ class JSInterpreter(object): return self._functions[fname](argvals) raise ExtractorError('Unsupported JS expression %r' % expr) + def extract_object(self, objname): + obj = {} + obj_m = re.search( + (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + + r'\s*(?P([a-zA-Z$]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' + + r'\}\s*;', + self.code) + fields = obj_m.group('fields') + # Currently, it only supports function definitions + fields_m = re.finditer( + r'(?P[a-zA-Z$]+)\s*:\s*function' + r'\((?P[a-z,]+)\){(?P[^}]+)}', + fields) + for f in fields_m: + argnames = f.group('args').split(',') + obj[f.group('key')] = self.build_function(argnames, f.group('code')) + + return obj + def extract_function(self, funcname): func_m = re.search( (r'(?:function %s|[{;]%s\s*=\s*function)' % ( @@ -107,10 +139,12 @@ class JSInterpreter(object): raise ExtractorError('Could not find JS function %r' % funcname) argnames = func_m.group('args').split(',') + return self.build_function(argnames, func_m.group('code')) + + def build_function(self, argnames, code): def resf(args): local_vars = dict(zip(argnames, args)) - for stmt in func_m.group('code').split(';'): + for stmt in code.split(';'): res = self.interpret_statement(stmt, local_vars) return res return resf - diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2cba2bfc1..64a9618ca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1428,7 +1428,7 @@ US_RATINGS = { def strip_jsonp(code): - return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code) + return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) def qualities(quality_ids): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2c9591630..4d606c3d2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.11.3' +__version__ = '2014.07.15'