Merge pull request #1660 from pyed/master
author    Filippo Valsorda <filippo.valsorda@gmail.com>
Mon, 28 Oct 2013 04:14:19 +0000 (21:14 -0700)
committer Filippo Valsorda <filippo.valsorda@gmail.com>
Mon, 28 Oct 2013 04:14:19 +0000 (21:14 -0700)
[addanime] try to download HQ before normal

youtube_dl/extractor/facebook.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/rtlnow.py
youtube_dl/extractor/xhamster.py

youtube_dl/extractor/facebook.py
index 9d1bc07510c3148b8ed8659d697c46017c6a36ff..f8bdfc2d33c9f00b9f902a4303eb7024f4646312 100644
@@ -19,7 +19,8 @@ class FacebookIE(InfoExtractor):
     """Information Extractor for Facebook"""
 
     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
-    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
+    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
+    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
     _NETRC_MACHINE = 'facebook'
     IE_NAME = u'facebook'
     _TEST = {
@@ -36,50 +37,56 @@ class FacebookIE(InfoExtractor):
         """Report attempt to log in."""
         self.to_screen(u'Logging in')
 
-    def _real_initialize(self):
-        if self._downloader is None:
-            return
-
-        useremail = None
-        password = None
-        downloader_params = self._downloader.params
-
-        # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username', None) is not None:
-            useremail = downloader_params['username']
-            password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    useremail = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
-                return
-
+    def _login(self):
+        (useremail, password) = self._get_login_info()
         if useremail is None:
             return
 
-        # Log in
+        login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
+        login_page_req.add_header('Cookie', 'locale=en_US')
+        self.report_login()
+        login_page = self._download_webpage(login_page_req, None, note=False,
+            errnote=u'Unable to download login page')
+        lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
+        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+
         login_form = {
             'email': useremail,
             'pass': password,
-            'login': 'Log+In'
+            'lsd': lsd,
+            'lgnrnd': lgnrnd,
+            'next': 'http://facebook.com/home.php',
+            'default_persistent': '0',
+            'legacy_return': '1',
+            'timezone': '-60',
+            'trynum': '1',
             }
         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         try:
-            self.report_login()
             login_results = compat_urllib_request.urlopen(request).read()
             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
             self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                 return
+
+            check_form = {
+                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
+                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+                'name_action_selected': 'dont_save',
+                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
+            }
+            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
+            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            check_response = compat_urllib_request.urlopen(check_req).read()
+            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
+                self._downloader.report_warning(u'Unable to confirm login, you have to log in with your browser and authorize the login.')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
             return
 
+    def _real_initialize(self):
+        self._login()
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -93,7 +100,13 @@ class FacebookIE(InfoExtractor):
         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
         if not m:
-            raise ExtractorError(u'Cannot parse data')
+            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
+            if m_msg is not None:
+                raise ExtractorError(
+                    u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+                    expected=True)
+            else:
+                raise ExtractorError(u'Cannot parse data')
         data = dict(json.loads(m.group(1)))
         params_raw = compat_urllib_parse.unquote(data['params'])
         params = json.loads(params_raw)
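
The rewritten _login() above follows a common scraping pattern: fetch the login page, scrape the hidden form tokens (lsd, lgnrnd) out of the HTML, POST them back together with the credentials, and check the response for a re-served login form. A minimal standalone sketch of that pattern, using plain Python 3 urllib instead of youtube-dl's compat wrappers; the endpoint and field names here are illustrative assumptions, not a supported API:

    import re
    import urllib.parse
    import urllib.request

    LOGIN_URL = 'https://example.com/login.php'  # hypothetical stand-in endpoint

    def login(email, password):
        # Step 1: fetch the login page and scrape the hidden anti-CSRF tokens.
        page_req = urllib.request.Request(LOGIN_URL, headers={'Cookie': 'locale=en_US'})
        page = urllib.request.urlopen(page_req).read().decode('utf-8', 'replace')
        lsd = re.search(r'"lsd":"(\w*?)"', page)
        lgnrnd = re.search(r'name="lgnrnd" value="([^"]*?)"', page)
        if lsd is None or lgnrnd is None:
            raise ValueError('could not find the login tokens on the page')

        # Step 2: POST the credentials together with the scraped tokens.
        form = urllib.parse.urlencode({
            'email': email,
            'pass': password,
            'lsd': lsd.group(1),
            'lgnrnd': lgnrnd.group(1),
        }).encode('ascii')
        post_req = urllib.request.Request(LOGIN_URL, data=form, headers={
            'Content-Type': 'application/x-www-form-urlencoded',
        })
        result = urllib.request.urlopen(post_req).read().decode('utf-8', 'replace')

        # Step 3: a login form in the response usually means rejection
        # (bad credentials or rate limiting), not success.
        if re.search(r'<form[^>]*name="login"', result):
            raise ValueError('login rejected: bad credentials or rate limited')
        return result
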
youtube_dl/extractor/faz.py
index deaa4ed2d9bc14406b6a7d3d6e8b015c6fcf915d..89ed08db4cbb99f9381013813fa03a19474c8e24 100644
@@ -5,8 +5,6 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
-    clean_html,
-    get_element_by_attribute,
 )
 
 
@@ -47,12 +45,12 @@ class FazIE(InfoExtractor):
                 'format_id': code.lower(),
             })
 
-        descr_html = get_element_by_attribute('class', 'Content Copy', webpage)
+        descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
         info = {
             'id': video_id,
             'title': self._og_search_title(webpage),
             'formats': formats,
-            'description': clean_html(descr_html),
+            'description': descr,
             'thumbnail': config.find('STILL/STILL_BIG').text,
         }
         # TODO: Remove when #980 has been merged
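
The faz.py change trades the get_element_by_attribute() + clean_html() helpers for a single non-greedy regex over the page. A small sketch of that extraction in isolation, assuming the same <p class="Content Copy"> markup (the re.DOTALL flag is an addition here, not in the patch, so multi-line descriptions still match):

    import re

    def extract_description(webpage):
        # Non-greedy body match stops at the first closing </p>;
        # re.DOTALL lets the description span line breaks.
        m = re.search(r'<p class="Content Copy">(.*?)</p>', webpage, re.DOTALL)
        return m.group(1).strip() if m else None

    print(extract_description('<p class="Content Copy">Ein kurzes Video.</p>'))
    # -> Ein kurzes Video.
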
youtube_dl/extractor/generic.py
index 69e0a7bd271dd0965f5c1f6f9c3a7cdce7a3da0b..2c8fcf5ae5df24a1dedc5e461feb5ac2300688a6 100644
@@ -25,7 +25,7 @@ class GenericIE(InfoExtractor):
         {
             u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
             u'file': u'13601338388002.mp4',
-            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd',
             u'info_dict': {
                 u"uploader": u"www.hodiho.fr",
                 u"title": u"R\u00e9gis plante sa Jeep"
@@ -41,7 +41,17 @@ class GenericIE(InfoExtractor):
                 u"uploader_id": u"skillsmatter",
                 u"uploader": u"Skills Matter",
             }
-        }
+        },
+        # bandcamp page with custom domain
+        {
+            u'url': u'http://bronyrock.com/track/the-pony-mash',
+            u'file': u'3235767654.mp3',
+            u'info_dict': {
+                u'title': u'The Pony Mash',
+                u'uploader': u'M_Pallante',
+            },
+            u'skip': u'There is a limit of 200 free downloads / month for the test song',
+        },
     ]
 
     def report_download_webpage(self, video_id):
@@ -155,6 +165,12 @@ class GenericIE(InfoExtractor):
             surl = unescapeHTML(mobj.group(1))
             return self.url_result(surl, 'Youtube')
 
+        # Look for Bandcamp pages with custom domain
+        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
+        if mobj is not None:
+            burl = unescapeHTML(mobj.group(1))
+            return self.url_result(burl, 'Bandcamp')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
youtube_dl/extractor/rtlnow.py
index d1b08c9bc050b3639ca252f2e84a373a8e4fa5f9..9ac7c3be8c8f1b97f46c944f08124eafbe8f1a5a 100644
@@ -63,13 +63,12 @@ class RTLnowIE(InfoExtractor):
         },
     },
     {
-        u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1',
-        u'file': u'127367.flv',
+        u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1',
+        u'file': u'129679.flv',
         u'info_dict': {
-            u'upload_date': u'20130926', 
-            u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...',
-            u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin',
-            u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg',
+            u'upload_date': u'20131016', 
+            u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...',
+            u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig',
         },
         u'params': {
             u'skip_download': True,
youtube_dl/extractor/xhamster.py
index 81c4be3269150aafc72f6ff0980923119c299373..7444d3393a25f8a49778a5bd589aa839591bd9d8 100644
@@ -36,21 +36,25 @@ class XHamsterIE(InfoExtractor):
     }]
 
     def _real_extract(self,url):
+        def extract_video_url(webpage):
+            mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
+            if mobj is None:
+                raise ExtractorError(u'Unable to extract media URL')
+            if len(mobj.group('server')) == 0:
+                return compat_urllib_parse.unquote(mobj.group('file'))
+            else:
+                return mobj.group('server')+'/key='+mobj.group('file')
+
+        def is_hd(webpage):
+            return webpage.find('<div class=\'icon iconHD\'>') != -1
+
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
         seo = mobj.group('seo')
-        mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo)
+        mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
         webpage = self._download_webpage(mrss_url, video_id)
 
-        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        if len(mobj.group('server')) == 0:
-            video_url = compat_urllib_parse.unquote(mobj.group('file'))
-        else:
-            video_url = mobj.group('server')+'/key='+mobj.group('file')
-
         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
             webpage, u'title')
 
@@ -76,14 +80,32 @@ class XHamsterIE(InfoExtractor):
 
         age_limit = self._rta_search(webpage)
 
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      determine_ext(video_url),
-            'title':    video_title,
+        video_url = extract_video_url(webpage)
+        hd = is_hd(webpage)
+        formats = [{
+            'url': video_url,
+            'ext': determine_ext(video_url),
+            'format': 'hd' if hd else 'sd',
+            'format_id': 'hd' if hd else 'sd',
+        }]
+        if not hd:
+            webpage = self._download_webpage(mrss_url+'?hd', video_id)
+            if is_hd(webpage):
+                video_url = extract_video_url(webpage)
+                formats.append({
+                    'url': video_url,
+                    'ext': determine_ext(video_url),
+                    'format': 'hd',
+                    'format_id': 'hd',
+                })
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
             'description': video_description,
             'upload_date': video_upload_date,
             'uploader_id': video_uploader_id,
             'thumbnail': video_thumbnail,
             'age_limit': age_limit,
-        }]
+        }
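
The xhamster.py rewrite turns the single-URL extractor into a two-pass probe: parse the plain page first, and only if the HD icon is missing re-fetch the ?hd variant and append an HD entry to formats. A condensed sketch of that control flow; fetch() is a hypothetical stand-in for _download_webpage(), and the helpers are passed in rather than closed over:

    def build_formats(mrss_url, fetch, extract_video_url, is_hd):
        # Pass 1: the plain page; it may already serve the HD stream.
        webpage = fetch(mrss_url)
        hd = is_hd(webpage)
        formats = [{
            'url': extract_video_url(webpage),
            'format_id': 'hd' if hd else 'sd',
        }]
        if not hd:
            # Pass 2: probe the ?hd variant and append it only if the HD
            # marker actually shows up, so SD-only videos keep one format.
            webpage = fetch(mrss_url + '?hd')
            if is_hd(webpage):
                formats.append({
                    'url': extract_video_url(webpage),
                    'format_id': 'hd',
                })
        # Ordered worst-to-best, so a "last entry is best" consumer picks HD.
        return formats
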