%(upload_date)s for the upload date
(YYYYMMDD), %(extractor)s for the provider
(youtube, metacafe, etc), %(id)s for the
- video id, %(playlist)s for the playlist the
+ video id, %(playlist_title)s,
+ %(playlist_id)s, or %(playlist)s (=title if
+ present, ID otherwise) for the playlist the
video is in, %(playlist_index)s for the
- position in the playlist and %% for a
- literal percent. %(height)s and %(width)s
- for the width and height of the video
- format. %(resolution)s for a textual
+ position in the playlist. %(height)s and
+ %(width)s for the width and height of the
+ video format. %(resolution)s for a textual
description of the resolution of the video
- format. Use - to output to stdout. Can also
- be used to download to a different
- directory, for example with -o '/my/downloa
- ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
+ format. %% for a literal percent. Use - to
+ output to stdout. Can also be used to
+ download to a different directory, for
+ example with -o '/my/downloads/%(uploader)s
+ /%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
%(autonumber)s when it is present in output
filename template or --auto-number option
"worst", "worstvideo" and "worstaudio". By
default, youtube-dl will pick the best
quality. Use commas to download multiple
- audio formats, such as -f
- 136/137/mp4/bestvideo,140/m4a/bestaudio
+ audio formats, such as -f
+ 136/137/mp4/bestvideo,140/m4a/bestaudio.
+ You can merge the video and audio of two
+ formats into a single file using -f <video-
+ format>+<audio-format> (requires ffmpeg or
+ avconv), for example -f
+ bestvideo+bestaudio.
--all-formats download all available video formats
--prefer-free-formats prefer free video formats unless a specific
one is requested
# Different instances of the downloader can't share the same dictionary
# some test set the "sublang" parameter, which would break the md5 checks.
params = get_params(override=override)
- super(FakeYDL, self).__init__(params)
+ super(FakeYDL, self).__init__(params, auto_init=False)
self.result = []
def to_screen(self, s, skip_eol=None):
all_names = youtube_dl.compat.__all__
present_names = set(filter(
lambda c: '_' in c and not c.startswith('_'),
- dir(youtube_dl.compat)))
+ dir(youtube_dl.compat))) - set(['unicode_literals'])
self.assertEqual(all_names, sorted(present_names))
if __name__ == '__main__':
params.setdefault('extract_flat', True)
params.setdefault('skip_download', True)
- ydl = YoutubeDL(params)
+ ydl = YoutubeDL(params, auto_init=False)
ydl.add_default_info_extractors()
finished_hook_called = set()
def _hook(status):
encodeFilename,
find_xpath_attr,
fix_xml_ampersands,
- get_meta_content,
orderedSet,
OnDemandPagedList,
InAdvancePagedList,
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
- def test_meta_parser(self):
- testhtml = '''
- <head>
- <meta name="description" content="foo & bar">
- <meta content='Plato' name='author'/>
- </head>
- '''
- get_meta = lambda name: get_meta_content(name, testhtml)
- self.assertEqual(get_meta('description'), 'foo & bar')
- self.assertEqual(get_meta('author'), 'Plato')
-
def test_xpath_with_ns(self):
testxml = '''<root xmlns:media="http://example.com/">
<media:song>
import string
from youtube_dl.extractor import YoutubeIE
-from youtube_dl.utils import compat_str, compat_urlretrieve
+from youtube_dl.compat import compat_str, compat_urlretrieve
_TESTS = [
(
extra = {
'n_entries': n_entries,
'playlist': playlist,
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
'playlist_index': i + playliststart,
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
formats_info = (self.select_format(format_1, formats),
self.select_format(format_2, formats))
if all(formats_info):
+ # The first format must contain the video and the
+ # second the audio
+ if formats_info[0].get('vcodec') == 'none':
+ self.report_error('The first format must '
+ 'contain the video, try using '
+ '"-f %s+%s"' % (format_2, format_1))
+ return
selected_format = {
'requested_formats': formats_info,
'format': rf,
self.report_warning(
'Your Python is broken! Update to a newer and supported version')
+ stdout_encoding = getattr(
+ sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
encoding_str = (
'[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
locale.getpreferredencoding(),
sys.getfilesystemencoding(),
- sys.stdout.encoding,
+ stdout_encoding,
self.get_encoding()))
write_string(encoding_str, encoding=None)
from .vevo import VevoIE
from .vgtv import VGTVIE
from .vh1 import VH1IE
+from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
_VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.abc.net.au/news/2014-07-25/bringing-asylum-seekers-to-australia-would-give/5624716',
- 'md5': 'dad6f8ad011a70d9ddf887ce6d5d0742',
+ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
+ 'md5': 'cb3dd03b18455a661071ee1e28344d9f',
'info_dict': {
- 'id': '5624716',
+ 'id': '5868334',
'ext': 'mp4',
- 'title': 'Bringing asylum seekers to Australia would give them right to asylum claims: professor',
- 'description': 'md5:ba36fa5e27e5c9251fd929d339aea4af',
+ 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
+ 'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
},
}
'id': '19546517',
'ext': 'mp4',
'title': 'Astérix - Le Domaine des Dieux Teaser VF',
- 'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
+ 'description': 'md5:abcd09ce503c6560512c14ebfdb720d2',
'thumbnail': 're:http://.*\.jpg',
},
}, {
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
'playlist': [
{
- 'file': '1353101989.mp3',
'md5': '39bc1eded3476e927c724321ddf116cf',
'info_dict': {
+ 'id': '1353101989',
+ 'ext': 'mp3',
'title': 'Intro',
}
},
{
- 'file': '38097443.mp3',
'md5': '1a2c32e2691474643e912cc6cd4bffaa',
'info_dict': {
+ 'id': '38097443',
+ 'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
}
},
],
+ 'info_dict': {
+ 'title': 'Jazz Format Mixtape vol.1',
+ },
'params': {
'playlistend': 2
},
compat_str,
compat_urllib_request,
compat_parse_qs,
+ compat_urllib_parse_urlparse,
determine_ext,
ExtractorError,
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+ _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_TESTS = [
formats = []
for rend in renditions:
url = rend['defaultURL']
+ if not url:
+ continue
if rend['remote']:
- # This type of renditions are served through akamaihd.net,
- # but they don't use f4m manifests
- url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
- ext = 'flv'
+ url_comp = compat_urllib_parse_urlparse(url)
+ if url_comp.path.endswith('.m3u8'):
+ formats.extend(
+ self._extract_m3u8_formats(url, info['id'], 'mp4'))
+ continue
+ elif 'akamaihd.net' in url_comp.netloc:
+ # This type of renditions are served through
+ # akamaihd.net, but they don't use f4m manifests
+ url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
+ ext = 'flv'
else:
ext = determine_ext(url)
size = rend.get('size')
class BYUtvIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = {
- 'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+ 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
'info_dict': {
- 'id': 'granite-flats-talking',
+ 'id': 'studio-c-season-5-episode-5',
'ext': 'mp4',
- 'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c',
- 'title': 'Talking',
+ 'description': 'md5:5438d33774b6bdc662f9485a340401cc',
+ 'title': 'Season 5 Episode 5',
'thumbnail': 're:^https?://.*promo.*'
},
'params': {
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
- 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,
- 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
'authors': [ 'Mike Wilmot' ],
},
}
def _extract_title(self, html):
title = self._html_search_meta('title', html, 'title')
- if title is None:
+ if title is None:
title = self._og_search_title(html)
TITLE_SUFFIX = ' (Channel 9)'
if title is not None and title.endswith(TITLE_SUFFIX):
return self._html_search_meta('description', html, 'description')
def _extract_duration(self, html):
- m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+ m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
def _extract_slides(self, html):
return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
- # Look for downloadable content
+ # Look for downloadable content
formats = self._formats_from_html(html)
slides = self._extract_slides(html)
zip_ = self._extract_zip(html)
webpage = self._download_webpage(url, content_path, 'Downloading web page')
- page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
- if page_type_m is None:
- raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
-
- page_type = page_type_m.group('pagetype')
- if page_type == 'List': # List page, may contain list of 'item'-like objects
+ page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+ if page_type_m is not None:
+ page_type = page_type_m.group('pagetype')
+ if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
+ return self._extract_entry_item(webpage, content_path)
+ elif page_type == 'Session': # Event session page, may contain downloadable content
+ return self._extract_session(webpage, content_path)
+ elif page_type == 'Event':
+ return self._extract_list(content_path)
+ else:
+ raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+ else: # Assuming list
return self._extract_list(content_path)
- elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
- return self._extract_entry_item(webpage, content_path)
- elif page_type == 'Session': # Event session page, may contain downloadable content
- return self._extract_session(webpage, content_path)
- else:
- raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file
webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+ mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage)
if not mobj:
raise ExtractorError('Can\'t extract embed url and video id')
playerdata_url = mobj.group('embed_url')
video_id = mobj.group('video_id')
+ full_video_id = mobj.group('full_video_id')
video_title = self._html_search_regex(
r'<title>(?P<title>.+?)\|', webpage, 'title')
vidurl = self._search_regex(
r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
- videolist_url = self._search_regex(
- r"file\s*:\s*'(http.+?/jwplayer\.smil)'", playerdata, 'jwplayer.smil')
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+ videolist_url = None
- formats = []
- baseurl = vidurl[:vidurl.rfind('/')+1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
- continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'))
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate // 1000 if bitrate else None,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate // 1000 if bitrate else None,
- 'vcodec': 'none',
- })
- formats.append(format)
- self._sort_formats(formats)
+ mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
+ if mobj:
+ videoserver = mobj.group('videoserver')
+ mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
+ vidid = mobj.group('vidid') if mobj else full_video_id
+ videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
+ else:
+ mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
+ if mobj:
+ videolist_url = mobj.group('smil')
+
+ if videolist_url:
+ videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+ formats = []
+ baseurl = vidurl[:vidurl.rfind('/')+1]
+ for video in videolist.findall('.//video'):
+ src = video.get('src')
+ if not src:
+ continue
+ file_ = src.partition(':')[-1]
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ bitrate = int_or_none(video.get('system-bitrate'))
+ format = {
+ 'url': baseurl + file_,
+ 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+ }
+ if width or height:
+ format.update({
+ 'tbr': bitrate // 1000 if bitrate else None,
+ 'width': width,
+ 'height': height,
+ })
+ else:
+ format.update({
+ 'abr': bitrate // 1000 if bitrate else None,
+ 'vcodec': 'none',
+ })
+ formats.append(format)
+ self._sort_formats(formats)
+ else:
+ formats = [{
+ 'url': vidurl,
+ }]
return {
'id': video_id,
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
- 'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'info_dict': {
+ 'id': 'sports_2013_06_09_nadal-1-on-1.cnn',
+ 'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
'duration': 135,
},
{
"url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
- "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
"md5": "b5cc60c60a3477d185af8f19a2a26f4e",
"info_dict": {
+ 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
+ 'ext': 'mp4',
"title": "Student's epic speech stuns new freshmen",
"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
"upload_date": "20130821",
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
_name = name
if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
+ if group is None:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ else:
+ return mobj.group(group)
elif default is not _NO_DEFAULT:
return default
elif fatal:
'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
- res = self._search_regex(pattern, string, name, default, fatal, flags)
+ res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
- (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
- [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal, **kwargs)
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
compat_urllib_parse,
compat_urlparse,
unescapeHTML,
- get_meta_content,
)
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
+ _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
_TEST = {
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('page_id')
+ page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
- data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video')
+ data_video_json = self._search_regex(
+ r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
# Transform the manifest url to a link to the mp4 files
http_path = f4m_path[1:].split('/', 1)[1]
http_template = re.sub(QUALITIES_RE, r'%s', http_path)
http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
formats = []
for q in qualities:
formats.append({
return {
'id': data_video['guid'],
+ 'display_id': page_id,
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
- 'description': get_meta_content('description', webpage),
+ 'description': self._html_search_meta('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
'uploader': 'Championat',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/3541
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+ 'info_dict': {
+ 'id': '3866516442001',
+ 'ext': 'mp4',
+ 'title': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'description': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'uploader': 'SBS Broadcasting',
+ },
+ 'skip': 'Restricted to Netherlands',
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
return {
'id': video_id,
'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
'formats': [{
'format_id': m.group('format_id'),
'url': url,
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- str_to_int,
ExtractorError,
)
-import json
class GoshgayIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title')
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', webpage, default='false')
+ config_url = self._search_regex(
+ r"'config'\s*:\s*'([^']+)'", webpage, 'config URL')
- player_config = self._search_regex(
- r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings')
- player_vars = json.loads(player_config.replace("'", '"'))
- width = str_to_int(player_vars.get('width'))
- height = str_to_int(player_vars.get('height'))
- config_uri = player_vars.get('config')
+ config = self._download_xml(
+ config_url, video_id, 'Downloading player config XML')
- if config_uri is None:
- raise ExtractorError('Missing config URI')
- node = self._download_xml(config_uri, video_id, 'Downloading player config XML',
- errnote='Unable to download XML')
- if node is None:
+ if config is None:
raise ExtractorError('Missing config XML')
- if node.tag != 'config':
+ if config.tag != 'config':
raise ExtractorError('Missing config attribute')
- fns = node.findall('file')
- imgs = node.findall('image')
- if len(fns) != 1:
+ fns = config.findall('file')
+ if len(fns) < 1:
raise ExtractorError('Missing media URI')
video_url = fns[0].text
- if len(imgs) < 1:
- thumbnail = None
- else:
- thumbnail = imgs[0].text
url_comp = compat_urlparse.urlparse(url)
ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2])
'id': video_id,
'url': video_url,
'title': title,
- 'width': width,
- 'height': height,
'thumbnail': thumbnail,
'http_referer': ref,
- 'age_limit': 18,
+ 'age_limit': 0 if family_friendly == 'true' else 18,
}
from .common import InfoExtractor
-from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
-
-from ..utils import (
+from ..compat import (
+ compat_html_parser,
compat_urllib_parse,
+ compat_urllib_request,
compat_urlparse,
)
+from ..utils import ExtractorError
class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
from .common import InfoExtractor
from ..utils import (
- get_meta_content,
+ determine_ext,
int_or_none,
parse_iso8601,
)
'title': (
"Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
),
- 'format_id': 'mp4_720',
+ 'format_id': 'mp4_720p',
'timestamp': 1411812600,
'upload_date': '20140927',
'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
- 'thumbnail': 're:https?://.*\.jpg$',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
}
}
info = {
'id': video_id,
'thumbnail': self._og_search_thumbnail(webpage),
- 'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+ 'timestamp': parse_iso8601(
+ self._html_search_meta('date', webpage)),
'description': self._og_search_description(webpage),
}
- title = get_meta_content('fulltitle', webpage)
+ title = self._html_search_meta('fulltitle', webpage)
if title:
info['title'] = title
else:
label = source_node.attrib['label']
height = int_or_none(self._search_regex(
r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+ video_url = source_node.attrib['file']
+ ext = determine_ext(video_url, '')
formats.append({
- 'url': source_node.attrib['file'],
+ 'url': video_url,
'format_note': label,
+ 'format_id': '%s_%s' % (ext, label),
'height': height,
})
self._sort_formats(formats)
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- get_element_by_attribute,
)
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
- descr = get_element_by_attribute('itemprop', 'description', webpage)
+ descr = self._html_search_regex(
+ r'(?s)<span itemprop="description">(.*?)</span>',
+ webpage, 'description', fatal=False)
available_formats = re.findall(
r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
flags=re.MULTILINE)
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- list_id = mobj.group('id')
-
+ list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
entries = [
self.url_result('http://www.imdb.com' + m, 'Imdb')
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
- parse_iso8601,
determine_ext,
- int_or_none,
float_or_none,
+ get_element_by_id,
+ int_or_none,
+ parse_iso8601,
str_to_int,
)
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404298698,
+ 'timestamp': 1404302298,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163318593,
+ 'timestamp': 1163322193,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- url = 'http://www.izlesene.com/video/%s' % video_id
+ video_id = self._match_id(url)
+ url = 'http://www.izlesene.com/video/%s' % video_id
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
if not m:
return rtmp_video_url
- base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+ base = 'http://viacommtvstrmfs.fplive.net/'
return base + m.group('finalid')
def _get_feed_url(self, uri):
_VALID_URL = r'http://www\.myspass\.de/.*'
_TEST = {
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
- 'file': '11741.mp4',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
'info_dict': {
+ 'id': '11741',
+ 'ext': 'mp4',
"description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
"title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2",
},
thumbnail = None
- video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
+ video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
if thumbnails:
unified_strdate,
parse_duration,
int_or_none,
+ ExtractorError,
)
flv_info_request, video_id,
note='Downloading flv info', errnote='Unable to download flv info')
+ if 'deleted=' in flv_info_webpage:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
entries = [{
'_type': 'url',
'ie_key': NiconicoIE.ie_key(),
- 'url': 'http://www.nicovideo.jp/watch/%s' % entry['item_id'],
+ 'url': ('http://www.nicovideo.jp/watch/%s' %
+ entry['item_data']['video_id']),
} for entry in entries]
return {
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
compat_urllib_parse,
compat_urllib_request,
)
video_id = self._match_id(url)
orig_webpage = self._download_webpage(url, video_id)
+
+ m_error = re.search(
+ r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage)
+ if m_error:
+ raise ExtractorError(m_error.group('msg'), expected=True)
+
fields = re.findall(
r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
data = dict(fields)
class StreamcloudIE(InfoExtractor):
IE_NAME = 'streamcloud.eu'
- _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+ _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
_TEST = {
'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+ url = 'http://streamcloud.eu/%s' % video_id
orig_webpage = self._download_webpage(url, video_id)
if media_type == 'Video':
fmt.update({
- 'format_note': ['144p', '288p', '544p'][quality-1],
+ 'format_note': ['144p', '288p', '544p', '720p'][quality-1],
'vcodec': codec,
})
elif media_type == 'Audio':
'uploader': attr['channel_title'],
'uploader_id': attr['channel_idkey'],
'formats': formats,
- }
\ No newline at end of file
+ }
request = compat_urllib_request.Request(playlist_url)
request.add_header('X-Requested-With', 'XMLHttpRequest')
request.add_header('Accept', 'application/json')
+ request.add_header('Referer', url)
playlist = self._download_json(request, display_id)
'actively fooling us.'),
'uploader': 'Dan Dennett',
'width': 854,
+ 'duration': 1308,
}
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
'title': 'Be passionate. Be courageous. Be your best.',
'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
+ 'duration': 1128,
},
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
+ 'duration': talk_info.get('duration'),
}
def _get_available_subtitles(self, video_id, talk_info):
# filehd is always 404
video_url = xpath_text(config, './file', 'video URL', fatal=True)
- title = xpath_text(config, './title', 'title')
+ title = xpath_text(config, './title', 'title').strip()
thumbnail = xpath_text(config, './image', ' thumbnail')
return {
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- get_meta_content,
)
m = re.match(self._VALID_URL, url)
display_id = m.group('slug')
webpage = self._download_webpage(url, display_id)
- channel_id = get_meta_content('ustream:channel_id', webpage)
+ channel_id = self._html_search_meta('ustream:channel_id', webpage)
BASE = 'http://www.ustream.tv'
next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
--- /dev/null
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import ExtractorError
+
+
+class ViceIE(InfoExtractor):
+ _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+
+ _TEST = {
+ 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ name = mobj.group('name')
+ webpage = self._download_webpage(url, name)
+ try:
+ embed_code = self._search_regex(
+ r'embedCode=([^&\'"]+)', webpage,
+ 'ooyala embed code')
+ ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
+ print(ooyala_url)
+ except ExtractorError:
+ raise ExtractorError('The page doesn\'t contain a video', expected=True)
+ return self.url_result(ooyala_url, ie='Ooyala')
+
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r"'file'\s*:\s*'([^']+)'", webpage, 'video URL')
+ r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
},
}, {
- 'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad',
- 'md5': '1e546a18e1c22ac6e9adce17b8961ff5',
+ 'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
+ 'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
'info_dict': {
- 'id': '9oXJqdcndqv',
+ 'id': '063jOPX5ue2',
'ext': 'ogg',
- 'title': 'David Guetta & Showtek ft. Vassy - Bad',
- 'duration': 270,
- 'uploader_id': 'w729',
- 'description': 'md5:4628f01c666bbaaecefa83476cfa794a',
+ 'title': 'Liber & Natalia Szroeder - Teraz Ty',
+ 'duration': 203,
+ 'uploader_id': 'jolka85',
+ 'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
},
}]
quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
- audio_table = {'flv': 'mp3', 'webm': 'ogg'}
+ audio_table = {'flv': 'mp3', 'webm': 'ogg', '???': 'mp3'}
embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id)
formats = []
for media in embedpage['url']:
+ fmt = media['type'].split('@')[0]
if typ == 'audio':
- ext = audio_table[media['type'].split('@')[0]]
+ ext = audio_table.get(fmt, fmt)
else:
- ext = media['type'].split('@')[0]
+ ext = fmt
formats.append({
'format_id': '%s_%s' % (ext, media['quality'].lower()),
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
+ _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])'
_TEST = {
'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
- 'file': '2189178.flv',
'md5': '07e15fa469ba384c7693fd246905547c',
'info_dict': {
+ 'id': '2189178',
+ 'ext': 'flv',
"title": "Zeichentrick 1",
"age_limit": 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('videoid')
-
- # Get webpage content
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
-
age_limit = self._rta_search(webpage)
-
- # Get the video title
- video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
- webpage, 'title').strip()
-
- # Get the embed page
- result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
- if result is None:
- raise ExtractorError('ERROR: unable to extract embed page')
-
- embed_page_url = result.group(0).strip()
- video_id = result.group('videoid')
-
- webpage = self._download_webpage(embed_page_url, video_id)
+ video_title = self._html_search_regex(
+ r'<title>\s*(.*)\s*</title>', webpage, 'title')
+
+ embed_page_url = self._search_regex(
+ r'(https?://www.youjizz.com/videos/embed/[0-9]+)',
+ webpage, 'embed page')
+ webpage = self._download_webpage(
+ embed_page_url, video_id, note='downloading embed page')
# Get the video URL
m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
# Get video info
self.report_video_info_webpage_download(video_id)
if re.search(r'player-age-gate-content">', video_webpage) is not None:
- self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+ r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
})
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
else:
age_gate = False
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
'info_dict': {
'title': 'ytdl test PL',
+ 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
},
'playlist_count': 3,
}, {
video_format.add_option(
'-f', '--format',
action='store', dest='format', metavar='FORMAT', default=None,
- help='video format code, specify the order of preference using slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. Use commas to download multiple audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio')
+ help='video format code, specify the order of preference using'
+ ' slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also'
+ ' supported. You can also use the special names "best",'
+ ' "bestvideo", "bestaudio", "worst", "worstvideo" and'
+ ' "worstaudio". By default, youtube-dl will pick the best quality.'
+ ' Use commas to download multiple audio formats, such as'
+ ' -f 136/137/mp4/bestvideo,140/m4a/bestaudio.'
+ ' You can merge the video and audio of two formats into a single'
+ ' file using -f <video-format>+<audio-format> (requires ffmpeg or'
+ ' avconv), for example -f bestvideo+bestaudio.')
video_format.add_option(
'--all-formats',
action='store_const', dest='format', const='all',
'%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), '
'%(upload_date)s for the upload date (YYYYMMDD), '
'%(extractor)s for the provider (youtube, metacafe, etc), '
- '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
- '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+ '%(id)s for the video id, '
+ '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, '
+ '%(playlist_index)s for the position in the playlist. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
+ '%% for a literal percent. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option(
compat_chr,
compat_getenv,
compat_html_entities,
- compat_html_parser,
compat_parse_qs,
compat_str,
compat_urllib_error,
return n.text
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
- def __init(self):
- compat_html_parser.HTMLParser.__init__(self)
- self.html = None
-
- def loads(self, html):
- self.html = html
- self.feed(html)
- self.close()
-
-class AttrParser(BaseHTMLParser):
- """Modified HTMLParser that isolates a tag with the specified attribute"""
- def __init__(self, attribute, value):
- self.attribute = attribute
- self.value = value
- self.result = None
- self.started = False
- self.depth = {}
- self.watch_startpos = False
- self.error_count = 0
- BaseHTMLParser.__init__(self)
-
- def error(self, message):
- if self.error_count > 10 or self.started:
- raise compat_html_parser.HTMLParseError(message, self.getpos())
- self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
- self.error_count += 1
- self.goahead(1)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict(attrs)
- if self.started:
- self.find_startpos(None)
- if self.attribute in attrs and attrs[self.attribute] == self.value:
- self.result = [tag]
- self.started = True
- self.watch_startpos = True
- if self.started:
- if not tag in self.depth: self.depth[tag] = 0
- self.depth[tag] += 1
-
- def handle_endtag(self, tag):
- if self.started:
- if tag in self.depth: self.depth[tag] -= 1
- if self.depth[self.result[0]] == 0:
- self.started = False
- self.result.append(self.getpos())
-
- def find_startpos(self, x):
- """Needed to put the start position of the result (self.result[1])
- after the opening tag with the requested id"""
- if self.watch_startpos:
- self.watch_startpos = False
- self.result.append(self.getpos())
- handle_entityref = handle_charref = handle_data = handle_comment = \
- handle_decl = handle_pi = unknown_decl = find_startpos
-
- def get_result(self):
- if self.result is None:
- return None
- if len(self.result) != 3:
- return None
- lines = self.html.split('\n')
- lines = lines[self.result[1][0]-1:self.result[2][0]]
- lines[0] = lines[0][self.result[1][1]:]
- if len(lines) == 1:
- lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
- lines[-1] = lines[-1][:self.result[2][1]]
- return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
- AttrParser.parse_endtag = (lambda self, i:
- i + len("</scr'+'ipt>")
- if self.rawdata[i:].startswith("</scr'+'ipt>")
- else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
def get_element_by_id(id, html):
"""Return the content of the tag with the specified ID in the passed HTML document"""
return get_element_by_attribute("id", id, html)
+
def get_element_by_attribute(attribute, value, html):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
- parser = AttrParser(attribute, value)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
-class MetaParser(BaseHTMLParser):
- """
- Modified HTMLParser that isolates a meta tag with the specified name
- attribute.
- """
- def __init__(self, name):
- BaseHTMLParser.__init__(self)
- self.name = name
- self.content = None
- self.result = None
-
- def handle_starttag(self, tag, attrs):
- if tag != 'meta':
- return
- attrs = dict(attrs)
- if attrs.get('name') == self.name:
- self.result = attrs.get('content')
+ m = re.search(r'''(?xs)
+ <([a-zA-Z0-9:._-]+)
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s+%s=['"]?%s['"]?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s*>
+ (?P<content>.*?)
+ </\1>
+ ''' % (re.escape(attribute), re.escape(value)), html)
- def get_result(self):
- return self.result
+ if not m:
+ return None
+ res = m.group('content')
-def get_meta_content(name, html):
- """
- Return the content attribute from the meta tag with the given name attribute.
- """
- parser = MetaParser(name)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
+ if res.startswith('"') or res.startswith("'"):
+ res = res[1:-1]
+
+ return unescapeHTML(res)
def clean_html(html):
-__version__ = '2014.11.02.1'
+__version__ = '2014.11.12.1'