[facebook] Bypass download rate limits (closes #21018)
[youtube-dl] / youtube_dl / extractor / facebook.py
index f78479b92e89acfac9c46f55a92b942b70fa3317..a56f85c216177810e2ea217dd4309aee6a453b00 100644 (file)
@@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor):
     _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
 
     _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
-    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true'
+    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
 
     _TESTS = [{
         'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
@@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor):
             tahoe_data = self._download_webpage(
                 self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
                 data=urlencode_postdata({
-                    '__user': 0,
                     '__a': 1,
                     '__pc': self._search_regex(
                         r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
@@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor):
                     '__rev': self._search_regex(
                         r'client_revision["\']\s*:\s*(\d+),', webpage,
                         'client revision', default='3944515'),
+                    'fb_dtsg': self._search_regex(
+                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                        webpage, 'dtsg token', default=''),
                 }),
                 headers={
                     'Content-Type': 'application/x-www-form-urlencoded',
@@ -403,6 +405,11 @@ class FacebookIE(InfoExtractor):
         if not formats:
             raise ExtractorError('Cannot find video formats')
 
+        # Downloads with browser's User-Agent are rate limited. Working around
+        # with non-browser User-Agent.
+        for f in formats:
+            f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+
         self._sort_formats(formats)
 
         video_title = self._html_search_regex(
@@ -422,11 +429,11 @@ class FacebookIE(InfoExtractor):
         uploader = clean_html(get_element_by_id(
             'fbPhotoPageAuthorName', webpage)) or self._search_regex(
             r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
-            fatal=False) or self._og_search_title(webpage, fatal=False)
+            default=None) or self._og_search_title(webpage, fatal=False)
         timestamp = int_or_none(self._search_regex(
             r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
             'timestamp', default=None))
-        thumbnail = self._og_search_thumbnail(webpage)
+        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
 
         view_count = parse_count(self._search_regex(
             r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',