Merge pull request #5961 from dstftw/force-generic-extractor
author Sergey M. <dstftw@gmail.com>
Wed, 24 Jun 2015 14:10:45 +0000 (19:10 +0500)
committer Sergey M. <dstftw@gmail.com>
Wed, 24 Jun 2015 14:10:45 +0000 (19:10 +0500)
Add --force-generic-extractor

1  2 
youtube_dl/YoutubeDL.py
youtube_dl/extractor/generic.py
youtube_dl/options.py

diff --combined youtube_dl/YoutubeDL.py
index 6e4b6f56664f67a796b21f10b8d005f2f6e5b68d,a7d3a1c017fb6230639b522c1b36c356cfcaf93e..ef0f71bad45d6057dc99c1ce968629a0e357e57b
@@@ -119,7 -119,7 +119,7 @@@ class YoutubeDL(object)
  
      username:          Username for authentication purposes.
      password:          Password for authentication purposes.
 -    videopassword:     Password for acces a video.
 +    videopassword:     Password for accessing a video.
      usenetrc:          Use netrc for authentication instead.
      verbose:           Print additional info to stdout.
      quiet:             Do not print messages to stdout.
      outtmpl:           Template for output names.
      restrictfilenames: Do not allow "&" and spaces in file names
      ignoreerrors:      Do not stop on download errors.
+     force_generic_extractor: Force downloader to use the generic extractor
      nooverwrites:      Prevent overwriting files.
      playliststart:     Playlist item to start at.
      playlistend:       Playlist item to end at.
              info_dict.setdefault(key, value)
  
      def extract_info(self, url, download=True, ie_key=None, extra_info={},
-                      process=True):
+                      process=True, force_generic_extractor=False):
          '''
          Returns a list with a dictionary for each video we find.
          If 'download', also downloads the videos.
          extra_info is a dict containing the extra values to add to each result
          '''
  
+         if not ie_key and force_generic_extractor:
+             ie_key = 'Generic'
          if ie_key:
              ies = [self.get_info_extractor(ie_key)]
          else:
              info_dict['id'], info_dict.get('subtitles'),
              info_dict.get('automatic_captions'))
  
 -        # This extractors handle format selection themselves
 -        if info_dict['extractor'] in ['Youku']:
 -            if download:
 -                self.process_info(info_dict)
 -            return info_dict
 -
          # We now pick which formats have to be downloaded
          if info_dict.get('formats') is None:
              # There's only one format available
          for url in url_list:
              try:
                  # It also downloads the videos
-                 res = self.extract_info(url)
+                 res = self.extract_info(
+                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
              except UnavailableVideoError:
                  self.report_error('unable to download video')
              except MaxDownloadsReached:
index 5c03fddc6a2ee548a6617a3aea9ba161f6b3777d,c8582bda97f2b9704c3d53d53490296f0f8df0ac..7769ffc5c5f425ce04dc92147c77803291f0fdd5
@@@ -42,10 -42,6 +42,10 @@@ from .udn import UDNEmbedI
  from .senateisvp import SenateISVPIE
  from .bliptv import BlipTVIE
  from .svt import SVTIE
 +from .pornhub import PornHubIE
 +from .xhamster import XHamsterEmbedIE
 +from .vimeo import VimeoIE
 +from .dailymotion import DailymotionCloudIE
  
  
  class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
 +        # XHamster embed
 +        {
 +            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
 +            'info_dict': {
 +                'id': 'showthread',
 +                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
 +            },
 +            'playlist_mincount': 7,
 +        },
          # Embedded TED video
          {
              'url': 'http://en.support.wordpress.com/videos/ted-talks/',
                  'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
                  'uploader': 'Rogers Sportsnet',
              },
 +        },
 +        # Dailymotion Cloud video
 +        {
 +            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
 +            'md5': '49444254273501a64675a7e68c502681',
 +            'info_dict': {
 +                'id': '5585de919473990de4bee11b',
 +                'ext': 'mp4',
 +                'title': 'Le débat',
 +                'thumbnail': 're:^https?://.*\.jpe?g$',
 +            }
 +        },
 +        # AdobeTVVideo embed
 +        {
 +            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
 +            'md5': '43662b577c018ad707a63766462b1e87',
 +            'info_dict': {
 +                'id': '2456',
 +                'ext': 'mp4',
 +                'title': 'New experience with Acrobat DC',
 +                'description': 'New experience with Acrobat DC',
 +                'duration': 248.667,
 +            },
          }
      ]
  
              }
  
          if not self._downloader.params.get('test', False) and not is_intentional:
-             self._downloader.report_warning('Falling back on generic information extractor.')
+             force = self._downloader.params.get('force_generic_extractor', False)
+             self._downloader.report_warning(
+                 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
  
          if not full_response:
              request = compat_urllib_request.Request(url)
          if matches:
              return _playlist_from_matches(matches, ie='RtlNl')
  
 -        # Look for embedded (iframe) Vimeo player
 -        mobj = re.search(
 -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
 -        if mobj:
 -            player_url = unescapeHTML(mobj.group('url'))
 -            surl = smuggle_url(player_url, {'Referer': url})
 -            return self.url_result(surl)
 -        # Look for embedded (swf embed) Vimeo player
 -        mobj = re.search(
 -            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
 -        if mobj:
 -            return self.url_result(mobj.group(1))
 +        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
 +        if vimeo_url is not None:
 +            return self.url_result(vimeo_url)
  
          # Look for embedded YouTube player
          matches = re.findall(r'''(?x)
          if sportbox_urls:
              return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
  
 +        # Look for embedded PornHub player
 +        pornhub_url = PornHubIE._extract_url(webpage)
 +        if pornhub_url:
 +            return self.url_result(pornhub_url, 'PornHub')
 +
 +        # Look for embedded XHamster player
 +        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
 +        if xhamster_urls:
 +            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
 +
          # Look for embedded Tvigle player
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
          if senate_isvp_url:
              return self.url_result(senate_isvp_url, 'SenateISVP')
  
 +        # Look for Dailymotion Cloud videos
 +        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
 +        if dmcloud_url:
 +            return self.url_result(dmcloud_url, 'DailymotionCloud')
 +
 +        # Look for AdobeTVVideo embeds
 +        mobj = re.search(
 +            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
 +            webpage)
 +        if mobj is not None:
 +            return self.url_result(
 +                self._proto_relative_url(unescapeHTML(mobj.group(1))),
 +                'AdobeTVVideo')
 +
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
diff --combined youtube_dl/options.py
index 740458e51483f45f8d8474d68edaaac48b24941e,096ab6137460e2298a5a1fa17eb81a36de65be32..6aeca61ee5e099e80a5d98a893afb8adde72e1cd
@@@ -150,6 -150,10 +150,10 @@@ def parseOpts(overrideArguments=None)
          '--extractor-descriptions',
          action='store_true', dest='list_extractor_descriptions', default=False,
          help='Output descriptions of all supported extractors')
+     general.add_option(
+         '--force-generic-extractor',
+         action='store_true', dest='force_generic_extractor', default=False,
+         help='Force extraction to use the generic extractor')
      general.add_option(
          '--default-search',
          dest='default_search', metavar='PREFIX',
          metavar='POLICY', dest='fixup', default='detect_or_warn',
          help='Automatically correct known faults of the file. '
               'One of never (do nothing), warn (only emit a warning), '
 -             'detect_or_warn(the default; fix file if we can, warn otherwise)')
 +             'detect_or_warn (the default; fix file if we can, warn otherwise)')
      postproc.add_option(
          '--prefer-avconv',
          action='store_false', dest='prefer_ffmpeg',