From: Sergey M․ Date: Sat, 27 Sep 2014 19:56:51 +0000 (+0700) Subject: Merge branch 'ynet' of https://github.com/lenaten/youtube-dl into lenaten-ynet X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=6be1cd4ddb90698330b478cd24bee3e21c78e800;hp=2a1325fdde5c88fc052710b3c42fcc0d73153901;p=youtube-dl Merge branch 'ynet' of https://github.com/lenaten/youtube-dl into lenaten-ynet --- diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1713dc5a..4a9610355 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1250,12 +1250,13 @@ class YoutubeDL(object): # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # To work around aforementioned issue we will replace request's original URL with # percent-encoded one - url = req if isinstance(req, compat_str) else req.get_full_url() + req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str) + url = req if req_is_string else req.get_full_url() url_escaped = escape_url(url) # Substitute URL if any change after escaping if url != url_escaped: - if isinstance(req, compat_str): + if req_is_string: req = url_escaped else: req = compat_urllib_request.Request( diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 944e356ae..079221567 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -339,6 +339,7 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .sport5 import Sport5IE from .sportdeutschland import SportDeutschlandIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py index bc64423a3..ad86d6e58 100644 --- a/youtube_dl/extractor/anysex.py +++ b/youtube_dl/extractor/anysex.py @@ -35,7 +35,7 @@ class AnySexIE(InfoExtractor): title = self._html_search_regex(r'(.*?)', webpage, 'title') description = self._html_search_regex( - r'
([^<]+)
', webpage, 'description', fatal=False) + r'
]*>([^<]+)
', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) @@ -43,7 +43,7 @@ class AnySexIE(InfoExtractor): r'([^<]+)', webpage) duration = parse_duration(self._search_regex( - r'Duration: (\d+:\d+)', webpage, 'duration', fatal=False)) + r'Duration: (?:)?(\d+:\d+)', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'Views: (\d+)', webpage, 'view count', fatal=False)) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 14a196ffc..aacbf1414 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -7,6 +7,7 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, + str_to_int, ) @@ -20,6 +21,7 @@ class ExtremeTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', + 'view_count': int, 'age_limit': 18, } }, { @@ -39,8 +41,12 @@ class ExtremeTubeIE(InfoExtractor): video_title = self._html_search_regex( r'

]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', - fatal=False) + r'Uploaded by:\s*\s*(.+?)\s*', + webpage, 'uploader', fatal=False) + view_count = str_to_int(self._html_search_regex( + r'Views:\s*\s*([\d,\.]+)', + webpage, 'view count', fatal=False)) + video_url = compat_urllib_parse.unquote(self._html_search_regex( r'video_url=(.+?)&', webpage, 'video_url')) path = compat_urllib_parse_urlparse(video_url).path @@ -51,6 +57,7 @@ class ExtremeTubeIE(InfoExtractor): 'id': video_id, 'title': video_title, 'uploader': uploader, + 'view_count': view_count, 'url': video_url, 'format': format, 'format_id': format, diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py new file mode 100644 index 000000000..3f680bfc6 --- /dev/null +++ b/youtube_dl/extractor/sport5.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P\d+)' + _TESTS = [ + { + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') + + webpage = self._download_webpage(url, media_id) + + video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') + + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) + + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) + + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) + + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'duration': duration, + 'categories': categories, + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index df115d251..ebd64f0f5 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -19,7 +19,7 @@ class Vbox7IE(InfoExtractor): 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { 'id': '249bb972c2', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Смях! Чудо - чист за секунди - Скрита камера', }, } @@ -50,7 +50,6 @@ class Vbox7IE(InfoExtractor): return { 'id': video_id, 'url': final_url, - 'ext': 'flv', 'title': title, 'thumbnail': thumbnail_url, }