Merge branch 'next-url'

author Filippo Valsorda <filippo.valsorda@gmail.com>

Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)

committer Filippo Valsorda <filippo.valsorda@gmail.com>

Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)
author Filippo Valsorda <filippo.valsorda@gmail.com>
Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)
committer Filippo Valsorda <filippo.valsorda@gmail.com>
Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)
diff --combined youtube-dl

index 5a595901ccc1e8d36a9b24c00966104e05515ebf,595cce497ca677b8f7de61a02a01c30c0755c9e0..8d0d1cc3381afab236486af52f9712c110cfa311
--- 1/youtube-dl
--- 2/youtube-dl
+++ b/youtube-dl
@@@ -1176,6 -1176,7 +1176,7 @@@ class YoutubeIE(InfoExtractor)
         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+       _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
         _NETRC_MACHINE = 'youtube'
         # Listed in order of quality
         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
@@@ -1336,6 -1337,11 +1337,11 @@@
                         return
   
         def _real_extract(self, url):
+               # Extract original video URL from URL with redirection, like age verification, using next_url parameter
+               mobj = re.search(self._NEXT_URL_RE, url)
+               if mobj:
+                       url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
+ 
                 # Extract video id from URL
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
@@@ -2241,67 -2247,7 +2247,67 @@@ class GenericIE(InfoExtractor)
                 """Report information extraction."""
                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
   
+ +      def report_following_redirect(self, new_url):
+ +              """Report that a redirect to new_url is being followed."""
+ +              self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
+ +              
+ +      def _test_redirect(self, url):
+ +              """Probe url with a HEAD request; if it redirects (e.g. a URL shortener), download the target instead and return True, else return False."""
+ +              class HeadRequest(urllib2.Request):  # Request that reports HEAD as its HTTP method
+ +                      def get_method(self):
+ +                              return "HEAD"
+ +
+ +              class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
+ +                      """
+ +                      Subclass the HTTPRedirectHandler to make it use our 
+ +                      HeadRequest also on the redirected URL
+ +                      """
+ +                      def redirect_request(self, req, fp, code, msg, headers, newurl): 
+ +                              if code in (301, 302, 303, 307):
+ +                                  newurl = newurl.replace(' ', '%20') 
+ +                                  newheaders = dict((k,v) for k,v in req.headers.items()
+ +                                                    if k.lower() not in ("content-length", "content-type"))
+ +                                  return HeadRequest(newurl, 
+ +                                                     headers=newheaders,
+ +                                                     origin_req_host=req.get_origin_req_host(), 
+ +                                                     unverifiable=True) 
+ +                              else: 
+ +                                  raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
+ +                                  
+ +              class HTTPMethodFallback(urllib2.BaseHandler):
+ +                      """
+ +                      Fall back to GET if HEAD is not allowed (405 HTTP error)
+ +                      """
+ +                      def http_error_405(self, req, fp, code, msg, headers): 
+ +                              fp.read()  # read and discard the 405 response body before closing it
+ +                              fp.close()
+ +
+ +                              newheaders = dict((k,v) for k,v in req.headers.items()
+ +                                                if k.lower() not in ("content-length", "content-type"))
+ +                              return self.parent.open(urllib2.Request(req.get_full_url(), 
+ +                                                               headers=newheaders, 
+ +                                                               origin_req_host=req.get_origin_req_host(), 
+ +                                                               unverifiable=True))
+ +
+ +              # Build a bare opener carrying only the handlers listed below
+ +              opener = urllib2.OpenerDirector() 
+ +              for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
+ +                                      HTTPMethodFallback, HEADRedirectHandler,
+ +                                      urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
+ +                      opener.add_handler(handler())
+ +
+ +              response = opener.open(HeadRequest(url))
+ +              new_url = response.geturl()
+ +              
+ +              if url == new_url: return False  # final URL unchanged: no redirect was followed
+ +              
+ +              self.report_following_redirect(new_url)
+ +              self._downloader.download([new_url])  # hand the redirect target back to the downloader
+ +              return True
+ +
         def _real_extract(self, url):
+ +              if self._test_redirect(url): return
+ +              
                 # At this point we have a new video
                 self._downloader.increment_downloads()
   
@@@ -4624,6 -4570,7 +4630,7 @@@ def _real_main()
                 except IOError:
                         sys.exit(u'ERROR: batch file could not be read')
         all_urls = batchurls + args
+       all_urls = map(lambda url: url.strip(), all_urls)
   
         # General configuration
         cookie_processor = urllib2.HTTPCookieProcessor(jar)
diff --combined youtube_dl/__init__.py

index 5a595901ccc1e8d36a9b24c00966104e05515ebf,595cce497ca677b8f7de61a02a01c30c0755c9e0..8d0d1cc3381afab236486af52f9712c110cfa311
--- 1/youtube_dl/__init__.py
--- 2/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@@ -1176,6 -1176,7 +1176,7 @@@ class YoutubeIE(InfoExtractor)
         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+       _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
         _NETRC_MACHINE = 'youtube'
         # Listed in order of quality
         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
@@@ -1336,6 -1337,11 +1337,11 @@@
                         return
   
         def _real_extract(self, url):
+               # Extract original video URL from URL with redirection, like age verification, using next_url parameter
+               mobj = re.search(self._NEXT_URL_RE, url)
+               if mobj:
+                       url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
+ 
                 # Extract video id from URL
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
@@@ -2241,67 -2247,7 +2247,67 @@@ class GenericIE(InfoExtractor)
                 """Report information extraction."""
                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
   
+ +      def report_following_redirect(self, new_url):
+ +              """Report that a redirect to new_url is being followed."""
+ +              self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
+ +              
+ +      def _test_redirect(self, url):
+ +              """Probe url with a HEAD request; if it redirects (e.g. a URL shortener), download the target instead and return True, else return False."""
+ +              class HeadRequest(urllib2.Request):  # Request that reports HEAD as its HTTP method
+ +                      def get_method(self):
+ +                              return "HEAD"
+ +
+ +              class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
+ +                      """
+ +                      Subclass the HTTPRedirectHandler to make it use our 
+ +                      HeadRequest also on the redirected URL
+ +                      """
+ +                      def redirect_request(self, req, fp, code, msg, headers, newurl): 
+ +                              if code in (301, 302, 303, 307):
+ +                                  newurl = newurl.replace(' ', '%20') 
+ +                                  newheaders = dict((k,v) for k,v in req.headers.items()
+ +                                                    if k.lower() not in ("content-length", "content-type"))
+ +                                  return HeadRequest(newurl, 
+ +                                                     headers=newheaders,
+ +                                                     origin_req_host=req.get_origin_req_host(), 
+ +                                                     unverifiable=True) 
+ +                              else: 
+ +                                  raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
+ +                                  
+ +              class HTTPMethodFallback(urllib2.BaseHandler):
+ +                      """
+ +                      Fall back to GET if HEAD is not allowed (405 HTTP error)
+ +                      """
+ +                      def http_error_405(self, req, fp, code, msg, headers): 
+ +                              fp.read()  # read and discard the 405 response body before closing it
+ +                              fp.close()
+ +
+ +                              newheaders = dict((k,v) for k,v in req.headers.items()
+ +                                                if k.lower() not in ("content-length", "content-type"))
+ +                              return self.parent.open(urllib2.Request(req.get_full_url(), 
+ +                                                               headers=newheaders, 
+ +                                                               origin_req_host=req.get_origin_req_host(), 
+ +                                                               unverifiable=True))
+ +
+ +              # Build a bare opener carrying only the handlers listed below
+ +              opener = urllib2.OpenerDirector() 
+ +              for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
+ +                                      HTTPMethodFallback, HEADRedirectHandler,
+ +                                      urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
+ +                      opener.add_handler(handler())
+ +
+ +              response = opener.open(HeadRequest(url))
+ +              new_url = response.geturl()
+ +              
+ +              if url == new_url: return False  # final URL unchanged: no redirect was followed
+ +              
+ +              self.report_following_redirect(new_url)
+ +              self._downloader.download([new_url])  # hand the redirect target back to the downloader
+ +              return True
+ +
         def _real_extract(self, url):
+ +              if self._test_redirect(url): return
+ +              
                 # At this point we have a new video
                 self._downloader.increment_downloads()
   
@@@ -4624,6 -4570,7 +4630,7 @@@ def _real_main()
                 except IOError:
                         sys.exit(u'ERROR: batch file could not be read')
         all_urls = batchurls + args
+       all_urls = map(lambda url: url.strip(), all_urls)
   
         # General configuration
         cookie_processor = urllib2.HTTPCookieProcessor(jar)
author	Filippo Valsorda <filippo.valsorda@gmail.com>
	Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)
committer	Filippo Valsorda <filippo.valsorda@gmail.com>
	Sun, 25 Mar 2012 00:07:47 +0000 (01:07 +0100)
		1	2
youtube-dl	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/__init__.py	patch \|	diff1 \|	diff2 \|	blob \| history