Merge remote-tracking branch 'dstftw/generic-webpage-unescape'
[youtube-dl] / youtube_dl / extractor / generic.py
index 30160d59d4218601a7bff538f6fd96cc130bd7a4..23891325677e40165252788bba7a47e4cee4f91c 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 
 import os
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -13,9 +12,11 @@ from ..utils import (
     compat_urllib_parse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 
     ExtractorError,
     HEADRequest,
+    parse_xml,
     smuggle_url,
     unescapeHTML,
     unified_strdate,
@@ -23,6 +24,7 @@ from ..utils import (
 )
 from .brightcove import BrightcoveIE
 from .ooyala import OoyalaIE
+from .rutv import RUTVIE
 
 
 class GenericIE(InfoExtractor):
@@ -82,10 +84,10 @@ class GenericIE(InfoExtractor):
         # Direct link to a video
         {
             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'file': 'trailer.mp4',
             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
             'info_dict': {
                 'id': 'trailer',
+                'ext': 'mp4',
                 'title': 'trailer',
                 'upload_date': '20100513',
             }
@@ -93,7 +95,6 @@ class GenericIE(InfoExtractor):
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
-            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
             'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
             'info_dict': {
                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
@@ -101,6 +102,101 @@ class GenericIE(InfoExtractor):
                 'title': '2cc213299525360.mov',  # that's what we get
             },
         },
+        # second style of embedded ooyala videos
+        {
+            'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html',
+            'info_dict': {
+                'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk',
+                'ext': 'mp4',
+                'title': 'Behind-the-scenes: Financial Review Sunday ',
+                'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
+        # embed.ly video
+        {
+            'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
+            'info_dict': {
+                'id': '9ODmcdjQcHQ',
+                'ext': 'mp4',
+                'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
+                'upload_date': '20140225',
+                'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
+                'uploader': 'Tested',
+                'uploader_id': 'testedcom',
+            },
+            # No need to test YoutubeIE here
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # funnyordie embed
+        {
+            'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
+            'md5': '7cf780be104d40fea7bae52eed4a470e',
+            'info_dict': {
+                'id': '18e820ec3f',
+                'ext': 'mp4',
+                'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
+                'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+            },
+        },
+        # RUTV embed
+        {
+            'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
+            'info_dict': {
+                'id': '776940',
+                'ext': 'mp4',
+                'title': 'Охотское море стало целиком российским',
+                'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Embedded TED video
+        {
+            'url': 'http://en.support.wordpress.com/videos/ted-talks/',
+            'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+            'info_dict': {
+                'id': '981',
+                'ext': 'mp4',
+                'title': 'My web playroom',
+                'uploader': 'Ze Frank',
+                'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
+            }
+        },
+        # nowvideo embed hidden behind percent encoding
+        {
+            'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
+            'md5': '2baf4ddd70f697d94b1c18cf796d5107',
+            'info_dict': {
+                'id': '06e53103ca9aa',
+                'ext': 'flv',
+                'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
+                'description': 'No description',
+            },
+        },
     ]
 
     def report_download_webpage(self, video_id):
@@ -126,9 +222,14 @@ class GenericIE(InfoExtractor):
                     newurl = newurl.replace(' ', '%20')
                     newheaders = dict((k,v) for k,v in req.headers.items()
                                       if k.lower() not in ("content-length", "content-type"))
+                    try:
+                        # This function was deprecated in python 3.3 and removed in 3.4
+                        origin_req_host = req.get_origin_req_host()
+                    except AttributeError:
+                        origin_req_host = req.origin_req_host
                     return HEADRequest(newurl,
                                        headers=newheaders,
-                                       origin_req_host=req.get_origin_req_host(),
+                                       origin_req_host=origin_req_host,
                                        unverifiable=True)
                 else:
                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
@@ -195,7 +296,7 @@ class GenericIE(InfoExtractor):
             else:
                 assert ':' in default_search
                 return self.url_result(default_search + url)
-        video_id = os.path.splitext(url.split('/')[-1])[0]
+        video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
 
         self.to_screen('%s: Requesting header' % video_id)
 
@@ -241,12 +342,17 @@ class GenericIE(InfoExtractor):
 
         # Is it an RSS feed?
         try:
-            doc = xml.etree.ElementTree.fromstring(webpage)
+            doc = parse_xml(webpage)
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
-        except xml.etree.ElementTree.ParseError:
+        except compat_xml_parse_error:
             pass
 
+        # Sometimes embedded video player is hidden behind percent encoding
+        # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
+        # Unescaping the whole page allows to handle those cases in a generic way
+        webpage = compat_urllib_parse.unquote(webpage)
+
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
         #   Video Title - Site Name
@@ -280,9 +386,9 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded (iframe) Vimeo player
         mobj = re.search(
-            r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
         if mobj:
-            player_url = unescapeHTML(mobj.group(1))
+            player_url = unescapeHTML(mobj.group('url'))
             surl = smuggle_url(player_url, {'Referer': url})
             return self.url_result(surl, 'Vimeo')
 
@@ -348,9 +454,10 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
 
         # Look for Ooyala videos
-        mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+        mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+             re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
         if mobj is not None:
-            return OoyalaIE._build_url_result(mobj.group(1))
+            return OoyalaIE._build_url_result(mobj.group('ec'))
 
         # Look for Aparat videos
         mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@@ -362,11 +469,17 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group(1), 'Mpora')
 
-        # Look for embedded Novamov player
+        # Look for embedded NovaMov player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
         if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Novamov')
+            return self.url_result(mobj.group('url'), 'NovaMov')
+
+        # Look for embedded NowVideo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'NowVideo')
 
         # Look for embedded Facebook player
         mobj = re.search(
@@ -374,12 +487,38 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Facebook')
 
+        # Look for embedded VK player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'VK')
+
         # Look for embedded Huffington Post player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'HuffPost')
 
+        # Look for embed.ly
+        mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+        mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
+        if mobj is not None:
+            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+
+        # Look for funnyordie embed
+        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
+                     for eurl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for embedded RUTV player
+        rutv_url = RUTVIE._extract_url(webpage)
+        if rutv_url:
+            return self.url_result(rutv_url, 'RUTV')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
@@ -391,6 +530,13 @@ class GenericIE(InfoExtractor):
         if mobj is None:
             # Broaden the search a little bit: JWPlayer JS loader
             mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+
+        # Look for embedded TED player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'TED')
+
         if mobj is None:
             # Try to find twitter cards info
             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -404,6 +550,18 @@ class GenericIE(InfoExtractor):
         if mobj is None:
             # HTML5 video
             mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+        if mobj is None:
+            mobj = re.search(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+                webpage)
+            if mobj:
+                new_url = mobj.group(1)
+                self.report_following_redirect(new_url)
+                return {
+                    '_type': 'url',
+                    'url': new_url,
+                }
         if mobj is None:
             raise ExtractorError('Unsupported URL: %s' % url)