Fix some regexes

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 919f4f987eb6f5af3963656329ca9fc1c540d9a8..68b6338396dc84277792273db277154fe3cc4995 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -22,6 +22,8 @@ from ..utils import (
      HEADRequest,
      is_html,
      js_to_json,
+    KNOWN_EXTENSIONS,
+    mimetype2ext,
      orderedSet,
      sanitized_Request,
      smuggle_url,
@@ -36,6 +38,10 @@ from .brightcove import (
      BrightcoveLegacyIE,
      BrightcoveNewIE,
  )
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
  from .nbc import NBCSportsVPlayerIE
  from .ooyala import OoyalaIE
  from .rutv import RUTVIE
@@ -57,6 +63,7 @@ from .dailymotion import (
      DailymotionIE,
      DailymotionCloudIE,
  )
+from .dailymail import DailyMailIE
  from .onionstudios import OnionStudiosIE
  from .viewlift import ViewLiftEmbedIE
  from .mtv import MTVServicesEmbeddedIE
@@ -91,6 +98,9 @@ from .anvato import AnvatoIE
  from .washingtonpost import WashingtonPostIE
  from .wistia import WistiaIE
  from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
  
  
  class GenericIE(InfoExtractor):
@@ -568,6 +578,19 @@ class GenericIE(InfoExtractor):
              },
              'skip': 'movie expired',
          },
+        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+        {
+            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+            'info_dict': {
+                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+                'ext': 'mp4',
+                'title': 'Steampunk Fest Comes to Honesdale',
+                'duration': 43.276,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
          # embed.ly video
          {
              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -759,6 +782,20 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Dailymotion'],
          },
+        # DailyMail embed
+        {
+            'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+            'info_dict': {
+                'id': '1495629',
+                'ext': 'mp4',
+                'title': 'Care worker punches elderly dementia patient in head 11 times',
+                'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+            },
+            'add_ie': ['DailyMail'],
+            'params': {
+                'skip_download': True,
+            },
+        },
          # YouTube embed
          {
              'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@@ -1095,6 +1132,35 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              }
          },
+        {
+            # Video.js embed, multiple formats
+            'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
+            'info_dict': {
+                'id': 'yygqldloqIk',
+                'ext': 'mp4',
+                'title': 'SolidWorks. Урок 6 Настройка чертежа',
+                'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
+                'upload_date': '20130314',
+                'uploader': 'PROстое3D',
+                'uploader_id': 'PROstoe3D',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Video.js embed, single format
+            'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=',
+            'info_dict': {
+                'id': 'watch',
+                'ext': 'mp4',
+                'title': 'Step 1 -  Good Foundation',
+                'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          # rtl.nl embed
          {
              'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
@@ -1484,14 +1550,27 @@ class GenericIE(InfoExtractor):
          # LiveLeak embed
          {
              'url': 'http://www.wykop.pl/link/3088787/',
-            'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+            'md5': '7619da8c820e835bef21a1efa2a0fc71',
              'info_dict': {
                  'id': '874_1459135191',
                  'ext': 'mp4',
                  'title': 'Man shows poor quality of new apartment building',
                  'description': 'The wall is like a sand pile.',
                  'uploader': 'Lake8737',
-            }
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
+        },
+        # Another LiveLeak embed pattern (#13336)
+        {
+            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+            'info_dict': {
+                'id': '2eb_1496309988',
+                'ext': 'mp4',
+                'title': 'Thief robs place where everyone was armed',
+                'description': 'md5:694d73ee79e535953cf2488562288eee',
+                'uploader': 'brazilwtf',
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
          },
          # Duplicated embedded video URLs
          {
@@ -1533,6 +1612,22 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['BrightcoveLegacy'],
          },
+        # Nexx embed
+        {
+            'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
+            'info_dict': {
+                'id': '247746',
+                'ext': 'mp4',
+                'title': "Yesterday's Jam (OV)",
+                'description': 'md5:09bc0984723fed34e2581624a84e05f0',
+                'timestamp': 1492594816,
+                'upload_date': '20170419',
+            },
+            'params': {
+                'format': 'bestvideo',
+                'skip_download': True,
+            },
+        },
          # Facebook <iframe> embed
          {
              'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1734,6 +1829,21 @@ class GenericIE(InfoExtractor):
              },
              'playlist_mincount': 5,
          },
+        {
+            # Limelight embed (LimelightPlayerUtil.embed)
+            'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+            'info_dict': {
+                'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+                'ext': 'mp4',
+                'title': '07448641',
+                'timestamp': 1499890639,
+                'upload_date': '20170712',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['LimelightMedia'],
+        },
          {
              'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
              'info_dict': {
@@ -1770,6 +1880,16 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': [MediasetIE.ie_key()],
          },
+        {
+            # JOJ.sk embeds
+            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+            'info_dict': {
+                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+                'title': 'Slovenskom sa prehnala vlna silných búrok',
+            },
+            'playlist_mincount': 5,
+            'add_ie': [JojIE.ie_key()],
+        },
          {
              # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
              'url': 'https://tvrain.ru/amp/418921/',
@@ -1780,6 +1900,25 @@ class GenericIE(InfoExtractor):
                  'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
              },
          },
+        {
+            # vzaar embed
+            'url': 'http://help.vzaar.com/article/165-embedding-video',
+            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+            'info_dict': {
+                'id': '8707641',
+                'ext': 'mp4',
+                'title': 'Building A Business Online: Principal Chairs Q & A',
+            },
+        },
+        {
+            # multiple HTML5 videos on one page
+            'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
+            'info_dict': {
+                'id': 'keyscenarios',
+                'title': 'Rescue Kit 14 Free Edition - Getting started',
+            },
+            'playlist_count': 4,
+        }
          # {
          #     # TODO: find another test
          #     # http://schema.org/VideoObject
@@ -1929,7 +2068,7 @@ class GenericIE(InfoExtractor):
  
          if head_response is not False:
              # Check for redirect
-            new_url = head_response.geturl()
+            new_url = compat_str(head_response.geturl())
              if url != new_url:
                  self.report_following_redirect(new_url)
                  if force_videoid:
@@ -2030,7 +2169,7 @@ class GenericIE(InfoExtractor):
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
                      doc, video_id,
-                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
                      mpd_url=url)
                  self._sort_formats(info_dict['formats'])
                  return info_dict
@@ -2067,7 +2206,7 @@ class GenericIE(InfoExtractor):
          # And then there are the jokers who advertise that they use RTA,
          # but actually don't.
          AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
+            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
          ]
          if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
              age_limit = 18
@@ -2107,6 +2246,16 @@ class GenericIE(InfoExtractor):
          if bc_urls:
              return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
  
+        # Look for Nexx embeds
+        nexx_urls = NexxIE._extract_urls(webpage)
+        if nexx_urls:
+            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+        # Look for Nexx iFrame embeds
+        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+        if nexx_embed_urls:
+            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
+
          # Look for ThePlatform embeds
          tp_urls = ThePlatformIE._extract_urls(webpage)
          if tp_urls:
@@ -2134,36 +2283,11 @@ class GenericIE(InfoExtractor):
          if vid_me_embed_url is not None:
              return self.url_result(vid_me_embed_url, 'Vidme')
  
-        # Look for embedded YouTube player
-        matches = re.findall(r'''(?x)
-            (?:
-                <iframe[^>]+?src=|
-                data-video-url=|
-                <embed[^>]+?src=|
-                embedSWF\(?:\s*|
-                <object[^>]+data=|
-                new\s+SWFObject\(
-            )
-            (["\'])
-                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
-                (?:embed|v|p)/.+?)
-            \1''', webpage)
-        if matches:
+        # Look for YouTube embeds
+        youtube_urls = YoutubeIE._extract_urls(webpage)
+        if youtube_urls:
              return self.playlist_from_matches(
-                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
-
-        # Look for lazyYT YouTube embed
-        matches = re.findall(
-            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
-
-        # Look for Wordpress "YouTube Video Importer" plugin
-        matches = re.findall(r'''(?x)<div[^>]+
-            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
-            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
+                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
  
          matches = DailymotionIE._extract_urls(webpage)
          if matches:
@@ -2179,6 +2303,12 @@ class GenericIE(InfoExtractor):
                  return self.playlist_from_matches(
                      playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
  
+        # Look for DailyMail embeds
+        dailymail_urls = DailyMailIE._extract_urls(webpage)
+        if dailymail_urls:
+            return self.playlist_from_matches(
+                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
+
          # Look for embedded Wistia player
          wistia_url = WistiaIE._extract_url(webpage)
          if wistia_url:
@@ -2230,6 +2360,7 @@ class GenericIE(InfoExtractor):
          # Look for Ooyala videos
          mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+                re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
          if mobj is not None:
@@ -2654,9 +2785,9 @@ class GenericIE(InfoExtractor):
                  self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
  
          # Look for LiveLeak embeds
-        liveleak_url = LiveLeakIE._extract_url(webpage)
-        if liveleak_url:
-            return self.url_result(liveleak_url, 'LiveLeak')
+        liveleak_urls = LiveLeakIE._extract_urls(webpage)
+        if liveleak_urls:
+            return self.playlist_from_matches(liveleak_urls, video_id, video_title)
  
          # Look for 3Q SDN embeds
          threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
@@ -2708,7 +2839,7 @@ class GenericIE(InfoExtractor):
          rutube_urls = RutubeIE._extract_urls(webpage)
          if rutube_urls:
              return self.playlist_from_matches(
-                rutube_urls, ie=RutubeIE.ie_key())
+                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
  
          # Look for WashingtonPost embeds
          wapo_urls = WashingtonPostIE._extract_urls(webpage)
@@ -2722,6 +2853,24 @@ class GenericIE(InfoExtractor):
              return self.playlist_from_matches(
                  mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
  
+        # Look for JOJ.sk embeds
+        joj_urls = JojIE._extract_urls(webpage)
+        if joj_urls:
+            return self.playlist_from_matches(
+                joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+        # Look for megaphone.fm embeds
+        mpfn_urls = MegaphoneIE._extract_urls(webpage)
+        if mpfn_urls:
+            return self.playlist_from_matches(
+                mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+        # Look for vzaar embeds
+        vzaar_urls = VzaarIE._extract_urls(webpage)
+        if vzaar_urls:
+            return self.playlist_from_matches(
+                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
          def merge_dicts(dict1, dict2):
              merged = {}
              for k, v in dict1.items():
@@ -2737,22 +2886,23 @@ class GenericIE(InfoExtractor):
                      merged[k] = v
              return merged
  
-        # Looking for http://schema.org/VideoObject
-        json_ld = self._search_json_ld(
-            webpage, video_id, default={}, expected_type='VideoObject')
-        if json_ld.get('url'):
-            return merge_dicts(json_ld, info_dict)
-
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
          if entries:
-            for entry in entries:
-                entry.update({
+            if len(entries) == 1:
+                entries[0].update({
                      'id': video_id,
                      'title': video_title,
                  })
+            else:
+                for num, entry in enumerate(entries, start=1):
+                    entry.update({
+                        'id': '%s-%s' % (video_id, num),
+                        'title': '%s (%d)' % (video_title, num),
+                    })
+            for entry in entries:
                  self._sort_formats(entry['formats'])
-            return self.playlist_result(entries)
+            return self.playlist_result(entries, video_id, video_title)
  
          jwplayer_data = self._find_jwplayer_data(
              webpage, video_id, transform_source=js_to_json)
@@ -2761,6 +2911,52 @@ class GenericIE(InfoExtractor):
                  jwplayer_data, video_id, require_title=False, base_url=url)
              return merge_dicts(info, info_dict)
  
+        # Video.js embed
+        mobj = re.search(
+            r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+            webpage)
+        if mobj is not None:
+            sources = self._parse_json(
+                mobj.group(1), video_id, transform_source=js_to_json,
+                fatal=False) or []
+            if not isinstance(sources, list):
+                sources = [sources]
+            formats = []
+            for source in sources:
+                src = source.get('src')
+                if not src or not isinstance(src, compat_str):
+                    continue
+                src = compat_urlparse.urljoin(url, src)
+                src_type = source.get('type')
+                if isinstance(src_type, compat_str):
+                    src_type = src_type.lower()
+                ext = determine_ext(src).lower()
+                if src_type == 'video/youtube':
+                    return self.url_result(src, YoutubeIE.ie_key())
+                if src_type == 'application/dash+xml' or ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        src, video_id, mpd_id='dash', fatal=False))
+                elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        src, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    formats.append({
+                        'url': src,
+                        'ext': (mimetype2ext(src_type) or
+                                ext if ext in KNOWN_EXTENSIONS else 'mp4'),
+                    })
+            if formats:
+                self._sort_formats(formats)
+                info_dict['formats'] = formats
+                return info_dict
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
+            return merge_dicts(json_ld, info_dict)
+
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
@@ -2848,7 +3044,7 @@ class GenericIE(InfoExtractor):
              # be supported by youtube-dl thus this is checked the very last (see
              # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
              embed_url = self._html_search_meta('twitter:player', webpage, default=None)
-            if embed_url:
+            if embed_url and embed_url != url:
                  return self.url_result(embed_url)
  
          if not found: