Better fix for getting source URLs

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 79c4f4b9ebc51181d1e14fac21f61ed4e2fb92c3..325c5ecd4c580018fd168f3812c3aafb885e5c57 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -124,8 +124,8 @@ class InfoExtractor(object):
                  errnote = u'Unable to download webpage'
              raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
  
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
-        """ Returns the data of the page as a string """
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
+        """ Returns a tuple (page content as string, URL handle) """
          urlh = self._request_webpage(url_or_request, video_id, note, errnote)
          content_type = urlh.headers.get('Content-Type', '')
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -142,7 +142,12 @@ class InfoExtractor(object):
              self.to_screen(u'Dumping request to ' + url)
              dump = base64.b64encode(webpage_bytes).decode('ascii')
              self._downloader.to_screen(dump)
-        return webpage_bytes.decode(encoding, 'replace')
+        content = webpage_bytes.decode(encoding, 'replace')
+        return (content, urlh)
+
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+        """ Returns the data of the page as a string (the URL handle is discarded) """
+        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
  
      def to_screen(self, msg):
          """Print msg to screen, prefixing it with '[ie_name]'"""
@@ -206,7 +211,7 @@ class YoutubeIE(InfoExtractor):
                       ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                       (?(1).+)?                                                # if we found the ID, everything can follow
                       $"""
-    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
+    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
      _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
      _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
@@ -422,7 +427,7 @@ class YoutubeIE(InfoExtractor):
  
          # Log in
          login_form_strs = {
-                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                  u'Email': username,
                  u'GALX': galx,
                  u'Passwd': password,
@@ -482,12 +487,12 @@ class YoutubeIE(InfoExtractor):
          # Extract original video URL from URL with redirection, like age verification, using next_url parameter
          mobj = re.search(self._NEXT_URL_RE, url)
          if mobj:
-            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
          video_id = self._extract_id(url)
  
          # Get video webpage
          self.report_video_webpage_download(video_id)
-        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
          request = compat_urllib_request.Request(url)
          try:
              video_webpage_bytes = compat_urllib_request.urlopen(request).read()
@@ -779,39 +784,6 @@ class MetacafeIE(InfoExtractor):
              'ext':      video_extension.decode('utf-8'),
          }]
  
-class RedtubeIE(InfoExtractor):
-    """Information Extractor for redtube"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
-    IE_NAME = u'redtube'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            self._downloader.report_error(u'invalid URL: %s' % url)
-            return
-        video_id = mobj.group('id')
-        video_extension = 'mp4'        
-        webpage = self._download_webpage(url, video_id)
-        self.report_extraction(video_id)
-        mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
-        if mobj is not None:
-            video_url = mobj.group(1)
-        else:
-            self._downloader.report_error(u'unable to extract media URL')
-            return
-        mobj = re.search('<h1 class="videoTitle slidePanelMovable">'+r'(.+)'+r'</h1>',webpage)
-        if mobj is not None:
-            video_title = mobj.group(1)
-        else:
-            video_title = 'Redtube - %s' % time.ctime()
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      video_extension,
-            'title':    video_title,
-        }]
-
  class DailymotionIE(InfoExtractor):
      """Information Extractor for Dailymotion"""
  
@@ -2298,16 +2270,14 @@ class ComedyCentralIE(InfoExtractor):
                  epTitle = mobj.group('episode')
  
          self.report_extraction(epTitle)
-        webpage = self._download_webpage(url, epTitle)
+        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
          if dlNewest:
              url = htmlHandle.geturl()
              mobj = re.match(self._VALID_URL, url, re.VERBOSE)
              if mobj is None:
-                self._downloader.report_error(u'Invalid redirected URL: ' + url)
-                return
+                raise ExtractorError(u'Invalid redirected URL: ' + url)
              if mobj.group('episode') == '':
-                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
-                return
+                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
              epTitle = mobj.group('episode')
  
          mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
@@ -2319,8 +2289,7 @@ class ComedyCentralIE(InfoExtractor):
  
              altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
              if len(altMovieParams) == 0:
-                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
-                return
+                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
              else:
                  mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
  
@@ -3518,7 +3487,7 @@ class WorldStarHipHopIE(InfoExtractor):
      IE_NAME = u'WorldStarHipHop'
  
      def _real_extract(self, url):
-        _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
+        _src_url = r'so\.addVariable\("file","(.*?)"\)'
  
          m = re.match(self._VALID_URL, url)
          video_id = m.group('id')
@@ -3528,7 +3497,7 @@ class WorldStarHipHopIE(InfoExtractor):
          mobj = re.search(_src_url, webpage_src)
  
          if mobj is not None:
-            video_url = mobj.group()
+            video_url = mobj.group(1)
              if 'mp4' in video_url:
                  ext = 'mp4'
              else:
@@ -4216,6 +4185,37 @@ class BandcampIE(InfoExtractor):
  
          return [track_info]
  
+class RedTubeIE(InfoExtractor):
+    """Information Extractor for redtube (video pages with a numeric ID)"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
+
+    def _real_extract(self, url):
+        """Return a one-element list holding the info dict for url.
+
+        Raises ExtractorError if the URL, media URL or title cannot be matched.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        self.report_extraction(video_id)
+
+        # Non-greedy groups stop at the first closing delimiter, so any later
+        # matching markup on the same line is not swallowed into the capture.
+        mobj = re.search(r'<source src="(.+?)" type="video/mp4">', webpage)
+        if mobj is None:
+            raise ExtractorError(u'Unable to extract media URL')
+        video_url = mobj.group(1)
+        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage)
+        if mobj is None:
+            raise ExtractorError(u'Unable to extract title')
+        video_title = mobj.group(1)
+
+        return [{
+            'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title,
+        }]
+
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4268,11 +4268,11 @@ def gen_extractors():
          TEDIE(),
          MySpassIE(),
          SpiegelIE(),
-        RedtubeIE(),
          LiveLeakIE(),
          ARDIE(),
          TumblrIE(),
          BandcampIE(),
+        RedTubeIE(),
          GenericIE()
      ]