[generic] Add support for LiveLeak embeds

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 12f2309fc98fb99d56c961621c863fb92cb99cc9..5b22b6b5e4568fbe60845491845c468f3606d262 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -60,6 +60,7 @@ from .googledrive import GoogleDriveIE
  from .jwplatform import JWPlatformIE
  from .digiteka import DigitekaIE
  from .instagram import InstagramIE
+from .liveleak import LiveLeakIE
  
  
  class GenericIE(InfoExtractor):
@@ -406,19 +407,6 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
-        # multiple ooyala embeds on SBN network websites
-        {
-            'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
-            'info_dict': {
-                'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
-                'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
-            },
-            'playlist_mincount': 3,
-            'params': {
-                'skip_download': True,
-            },
-            'add_ie': ['Ooyala'],
-        },
          # embed.ly video
          {
              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -1124,7 +1112,47 @@ class GenericIE(InfoExtractor):
                  # m3u8 downloads
                  'skip_download': True,
              }
-        }
+        },
+        # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
+        # This video can't be played in browsers if Flash is disabled and the UA is set to iPhone, which is actually a false alarm
+        {
+            'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
+            'info_dict': {
+                'id': '4785848093001',
+                'ext': 'mp4',
+                'title': 'The Cardinal Pell Interview',
+                'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
+                'uploader': 'GlobeCast Australia - GlobeStream',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+        },
+        # Another form of arte.tv embed
+        {
+            'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
+            'md5': '850bfe45417ddf221288c88a0cffe2e2',
+            'info_dict': {
+                'id': '030273-562_PLUS7-F',
+                'ext': 'mp4',
+                'title': 'ARTE Reportage - Nulle part, en France',
+                'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d',
+                'upload_date': '20160409',
+            },
+        },
+        # LiveLeak embed
+        {
+            'url': 'http://www.wykop.pl/link/3088787/',
+            'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+            'info_dict': {
+                'id': '874_1459135191',
+                'ext': 'mp4',
+                'title': 'Man shows poor quality of new apartment building',
+                'description': 'The wall is like a sand pile.',
+                'uploader': 'Lake8737',
+            }
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -1294,6 +1322,7 @@ class GenericIE(InfoExtractor):
                      'vcodec': 'none' if m.group('type') == 'audio' else None
                  }]
                  info_dict['direct'] = True
+            self._sort_formats(formats)
              info_dict['formats'] = formats
              return info_dict
  
@@ -1320,6 +1349,7 @@ class GenericIE(InfoExtractor):
          # Is it an M3U playlist?
          if first_bytes.startswith(b'#EXTM3U'):
              info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+            self._sort_formats(info_dict['formats'])
              return info_dict
  
          # Maybe it's a direct link to a video?
@@ -1344,15 +1374,19 @@ class GenericIE(InfoExtractor):
              if doc.tag == 'rss':
                  return self._extract_rss(url, video_id, doc)
              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
-                return self._parse_smil(doc, url, video_id)
+                smil = self._parse_smil(doc, url, video_id)
+                self._sort_formats(smil['formats'])
+                return smil
              elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
                      doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                self._sort_formats(info_dict['formats'])
                  return info_dict
              elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
                  info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                self._sort_formats(info_dict['formats'])
                  return info_dict
          except compat_xml_parse_error:
              pass
@@ -1693,7 +1727,7 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded arte.tv player
          mobj = re.search(
-            r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
+            r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
              webpage)
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'ArteTVEmbed')
@@ -1923,6 +1957,11 @@ class GenericIE(InfoExtractor):
          if instagram_embed_url is not None:
              return self.url_result(instagram_embed_url, InstagramIE.ie_key())
  
+        # Look for LiveLeak embeds
+        liveleak_url = LiveLeakIE._extract_url(webpage)
+        if liveleak_url:
+            return self.url_result(liveleak_url, 'LiveLeak')
+
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
@@ -2037,6 +2076,9 @@ class GenericIE(InfoExtractor):
              else:
                  entry_info_dict['url'] = video_url
  
+            if entry_info_dict.get('formats'):
+                self._sort_formats(entry_info_dict['formats'])
+
              entries.append(entry_info_dict)
  
          if len(entries) == 1: