import os
import re
+import xml.etree.ElementTree
from .common import InfoExtractor
from .youtube import YoutubeIE
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
ExtractorError,
HEADRequest,
'title': 'R\u00e9gis plante sa Jeep',
}
},
- # embedded vimeo video
- {
- 'add_ie': ['Vimeo'],
- 'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
- 'file': '22444065.mp4',
- 'md5': '2903896e23df39722c33f015af0666e2',
- 'info_dict': {
- 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
- 'uploader_id': 'skillsmatter',
- 'uploader': 'Skills Matter',
- }
- },
# bandcamp page with custom domain
{
'add_ie': ['Bandcamp'],
'skip_download': True,
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/2253
+ 'url': 'http://bcove.me/i6nfkrc3',
+ 'file': '3101154703001.mp4',
+ 'md5': '0ba9446db037002366bab3b3eb30c88c',
+ 'info_dict': {
+ 'title': 'Still no power',
+ 'uploader': 'thestar.com',
+ 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
+ },
+ 'add_ie': ['Brightcove'],
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'file': 'trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
+ 'ext': 'mp4',
'title': 'trailer',
'upload_date': '20100513',
}
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
- 'title': '2cc213299525360.mov', #that's what we get
+ 'title': '2cc213299525360.mov', # that's what we get
},
},
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ }
]
def report_download_webpage(self, video_id):
raise ExtractorError('Invalid URL protocol')
return response
+    def _extract_rss(self, url, video_id, doc):
+        """Turn a parsed RSS document into a playlist result.
+
+        Parameters:
+            url      -- URL the feed was fetched from; used as the playlist id.
+            video_id -- id derived from the URL by the caller (not used here,
+                        kept for signature compatibility).
+            doc      -- root element of the parsed feed (the <rss> element).
+
+        Returns a dict with '_type': 'playlist' whose entries are 'url'
+        results, one per <item> that carries a non-empty <link>.
+
+        Every child of an RSS <item> is optional, and broken feeds may even
+        lack the channel <title>, so no element is dereferenced without a
+        None check; a missing piece must not abort the whole extraction.
+        """
+        def text_or_none(element):
+            # Element.find() returns None when nothing matches
+            return None if element is None else element.text
+
+        playlist_title = text_or_none(doc.find('./channel/title'))
+        playlist_desc = text_or_none(doc.find('./channel/description'))
+
+        entries = []
+        for item in doc.findall('./channel/item'):
+            link = text_or_none(item.find('link'))
+            if not link:
+                # Without a link there is nothing to hand to another extractor
+                continue
+            entries.append({
+                '_type': 'url',
+                'url': link,
+                'title': text_or_none(item.find('title')),
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': playlist_title,
+            'description': playlist_desc,
+            'entries': entries,
+        }
+
def _real_extract(self, url):
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
- self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
- return self.url_result('http://' + url)
+ default_search = self._downloader.params.get('default_search')
+ if default_search is None:
+ default_search = 'auto'
+
+ if default_search == 'auto':
+ if '/' in url:
+ self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
+ return self.url_result('http://' + url)
+ else:
+ return self.url_result('ytsearch:' + url)
+ else:
+ assert ':' in default_search
+ return self.url_result(default_search + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
self.to_screen('%s: Requesting header' % video_id)
self.report_extraction(video_id)
+ # Is it an RSS feed?
+ try:
+ doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+ if doc.tag == 'rss':
+ return self._extract_rss(url, video_id, doc)
+ except compat_xml_parse_error:
+ pass
+
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Look for BrightCove:
- bc_url = BrightcoveIE._extract_brightcove_url(webpage)
- if bc_url is not None:
+ bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+ if bc_urls:
self.to_screen('Brightcove video detected.')
- surl = smuggle_url(bc_url, {'Referer': url})
- return self.url_result(surl, 'Brightcove')
+ entries = [{
+ '_type': 'url',
+ 'url': smuggle_url(bc_url, {'Referer': url}),
+ 'ie_key': 'Brightcove'
+ } for bc_url in bc_urls]
+
+ return {
+ '_type': 'playlist',
+ 'title': video_title,
+ 'id': video_id,
+ 'entries': entries,
+ }
# Look for embedded (iframe) Vimeo player
mobj = re.search(
- r'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage)
+ r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
if mobj:
player_url = unescapeHTML(mobj.group(1))
surl = smuggle_url(player_url, {'Referer': url})
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
- r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage)
+ r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
if mobj:
return self.url_result(mobj.group(1), 'Vimeo')
return self.url_result(mobj.group(1), 'Aparat')
# Look for MPORA videos
- mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage)
+ mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
if mobj is not None:
return self.url_result(mobj.group(1), 'Mpora')
- # Look for embedded Novamov player
+ # Look for embedded NovaMov player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
if mobj is not None:
- return self.url_result(mobj.group('url'), 'Novamov')
+ return self.url_result(mobj.group('url'), 'NovaMov')
+
+ # Look for embedded NowVideo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'NowVideo')
+
+ # Look for embedded Facebook player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Facebook')
+
+ # Look for embedded VK player
+ mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'VK')
+
+ # Look for embedded Huffington Post player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'HuffPost')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
# Look for gorilla-vid style embedding
- mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage)
+ mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)
if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
# HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+ if mobj is None:
+ mobj = re.search(
+ r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+ r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+ webpage)
+ if mobj:
+ new_url = mobj.group(1)
+ self.report_following_redirect(new_url)
+ return {
+ '_type': 'url',
+ 'url': new_url,
+ }
if mobj is None:
raise ExtractorError('Unsupported URL: %s' % url)