X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=f3de738f765819da7cbda1332e9b6b6dcada5052;hb=c9c39c22c5740c1eedcc9ce7a10f5df199ea5c78;hp=8121f04a5e02cf672dc6ab0f152d21df8b98034e;hpb=3f2f4a94aa63e156e8ad19ea493f23e2afec6b62;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8121f04a5..f3de738f7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -239,6 +240,35 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -1094,7 +1124,23 @@ class GenericIE(InfoExtractor): # m3u8 downloads 'skip_download': True, } - } + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', + 'uploader': 'GlobeCast Australia - GlobeStream', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, ] def report_following_redirect(self, new_url): @@ -1245,14 +1291,13 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P.+)$', content_type) + content_type = head_response.headers.get('Content-Type', '').lower() + m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) format_id = m.group('format_id') if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') @@ -1264,11 +1309,9 @@ class GenericIE(InfoExtractor): 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - info_dict.update({ - 'direct': True, - 'formats': formats, - 'upload_date': upload_date, - }) + info_dict['direct'] = True + self._sort_formats(formats) + info_dict['formats'] = formats return info_dict if not self._downloader.params.get('test', False) and not is_intentional: @@ -1289,18 +1332,22 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith(b'#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + self._sort_formats(info_dict['formats']) + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! - first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, }) return info_dict @@ -1315,15 +1362,19 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - return self._parse_smil(doc, url, video_id) + smil = self._parse_smil(doc, url, video_id) + self._sort_formats(smil['formats']) + return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, mpd_base_url=url.rpartition('/')[0]) + self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: pass @@ -1881,6 +1932,19 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r']+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -1995,6 +2059,9 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url + if entry_info_dict.get('formats'): + self._sort_formats(entry_info_dict['formats']) + entries.append(entry_info_dict) if len(entries) == 1: