Merge pull request #6439 from remitamine/facebook
authorSergey M. <dstftw@gmail.com>
Sun, 2 Aug 2015 23:43:01 +0000 (05:43 +0600)
committerSergey M. <dstftw@gmail.com>
Sun, 2 Aug 2015 23:43:01 +0000 (05:43 +0600)
[facebook] extract uploader

AUTHORS
README.md
youtube_dl/downloader/http.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/nowtv.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/screenwavemedia.py

diff --git a/AUTHORS b/AUTHORS
index aa6b88cc0bc625aab169db6f587ca4cf9275377c..d16d34272a5ae3b17ee6edd98fbea3eb371254b5 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -136,3 +136,4 @@ sceext
 Zach Bruggeman
 Tjark Saul
 slangangular
+Behrouz Abbasi
index ac54d7b67b8c36d370495759153678f711ac614e..2db3139eecaf9a6dfa1b82ead6553ab7b684208f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -439,6 +439,12 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt
     youtube-dl -- -wNyEUrxzFU
     youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU"
 
+### How do I pass cookies to youtube-dl?
+
+Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that cookies file must be in Mozilla/Netscape format and the first line of cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in cookies file and convert newlines if necessary to correspond your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.
+
+Passing cookies to youtube-dl is a good way to workaround login when particular extractor does not implement it explicitly.
+
 ### Can you add support for this anime video site, or site which shows current movies for free?
 
 As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl.
index b7f144af9ea33a102246632e04e71707be3d98ad..a29f5cf31fddf3f3b15f47311fafeef9ab4f3d7a 100644 (file)
@@ -4,6 +4,7 @@ import errno
 import os
 import socket
 import time
+import re
 
 from .common import FileDownloader
 from ..compat import (
@@ -57,6 +58,24 @@ class HttpFD(FileDownloader):
             # Establish connection
             try:
                 data = self.ydl.urlopen(request)
+                # When trying to resume, Content-Range HTTP header of response has to be checked
+                # to match the value of requested Range HTTP header. This is due to a webservers
+                # that don't support resuming and serve a whole file with no Content-Range
+                # set in response despite of requested Range (see
+                # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
+                if resume_len > 0:
+                    content_range = data.headers.get('Content-Range')
+                    if content_range:
+                        content_range_m = re.search(r'bytes (\d+)-', content_range)
+                        # Content-Range is present and matches requested Range, resume is possible
+                        if content_range_m and resume_len == int(content_range_m.group(1)):
+                            break
+                    # Content-Range is either not present or invalid. Assuming remote webserver is
+                    # trying to send the whole file, resume is not possible, so wiping the local file
+                    # and performing entire redownload
+                    self.report_unable_to_resume()
+                    resume_len = 0
+                    open_mode = 'wb'
                 break
             except (compat_urllib_error.HTTPError, ) as err:
                 if (err.code < 500 or err.code >= 600) and err.code != 416:
index 85d945509d07ca686490c3471d0b109991524ed1..2d90b222463678b2c49e096f1133a8753921965f 100644 (file)
@@ -15,7 +15,6 @@ from ..utils import (
     ExtractorError,
     determine_ext,
     int_or_none,
-    orderedSet,
     parse_iso8601,
     str_to_int,
     unescapeHTML,
@@ -278,7 +277,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
     }]
 
     def _extract_entries(self, id):
-        video_ids = []
+        video_ids = set()
         processed_urls = set()
         for pagenum in itertools.count(1):
             page_url = self._PAGE_TEMPLATE % (id, pagenum)
@@ -291,12 +290,13 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
             processed_urls.add(urlh.geturl())
 
-            video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
+            for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+                if video_id not in video_ids:
+                    yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+                    video_ids.add(video_id)
 
             if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
                 break
-        return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
-                for video_id in orderedSet(video_ids)]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 0b5ff47600559e50a2282b184b23d3fc7263d46d..ad938fb6287b1b1de9ae6883a38405ea01070f93 100644 (file)
@@ -1,12 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     ExtractorError,
+    determine_ext,
     int_or_none,
     parse_iso8601,
     parse_duration,
@@ -15,7 +14,7 @@ from ..utils import (
 
 
 class NowTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'
 
     _TESTS = [{
         # rtl
@@ -23,7 +22,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '203519',
             'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Die neuen Bauern und eine Hochzeit',
             'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -32,7 +31,7 @@ class NowTVIE(InfoExtractor):
             'duration': 2786,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
     }, {
@@ -41,7 +40,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '203481',
             'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Berlin - Tag & Nacht (Folge 934)',
             'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -50,7 +49,7 @@ class NowTVIE(InfoExtractor):
             'duration': 2641,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
     }, {
@@ -59,7 +58,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '165780',
             'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Hals- und Beinbruch',
             'description': 'md5:b50d248efffe244e6f56737f0911ca57',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -68,7 +67,7 @@ class NowTVIE(InfoExtractor):
             'duration': 2742,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
     }, {
@@ -77,7 +76,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '99205',
             'display_id': 'medicopter-117/angst',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Angst!',
             'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -86,7 +85,7 @@ class NowTVIE(InfoExtractor):
             'duration': 3025,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
     }, {
@@ -95,7 +94,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '203521',
             'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
             'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -104,7 +103,7 @@ class NowTVIE(InfoExtractor):
             'duration': 1083,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
     }, {
@@ -113,7 +112,7 @@ class NowTVIE(InfoExtractor):
         'info_dict': {
             'id': '128953',
             'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': "Büro-Fall / Chihuahua 'Joel'",
             'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
             'thumbnail': 're:^https?://.*\.jpg$',
@@ -122,15 +121,16 @@ class NowTVIE(InfoExtractor):
             'duration': 3092,
         },
         'params': {
-            # m3u8 download
+            # rtmp download
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('id')
-        station = mobj.group('station')
+        display_id = self._match_id(url)
 
         info = self._download_json(
             'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id,
@@ -148,29 +148,19 @@ class NowTVIE(InfoExtractor):
                 raise ExtractorError(
                     'Video %s is not available for free' % video_id, expected=True)
 
-        f = info.get('format', {})
-        station = f.get('station') or station
-
-        STATIONS = {
-            'rtl': 'rtlnow',
-            'rtl2': 'rtl2now',
-            'vox': 'voxnow',
-            'nitro': 'rtlnitronow',
-            'ntv': 'n-tvnow',
-            'superrtl': 'superrtlnow'
-        }
-
         formats = []
         for item in files['items']:
-            item_path = remove_start(item['path'], '/')
-            tbr = int_or_none(item['bitrate'])
-            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
-            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            if determine_ext(item['path']) != 'f4v':
+                continue
+            app, play_path = remove_start(item['path'], '/').split('/', 1)
             formats.append({
-                'url': m3u8_url,
-                'format_id': '%s-%sk' % (item['id'], tbr),
-                'ext': 'mp4',
-                'tbr': tbr,
+                'url': 'rtmpe://fms.rtl.de',
+                'app': app,
+                'play_path': 'mp4:%s' % play_path,
+                'ext': 'flv',
+                'page_url': url,
+                'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf',
+                'tbr': int_or_none(item.get('bitrate')),
             })
         self._sort_formats(formats)
 
@@ -178,6 +168,8 @@ class NowTVIE(InfoExtractor):
         description = info.get('articleLong') or info.get('articleShort')
         timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
         duration = parse_duration(info.get('duration'))
+
+        f = info.get('format', {})
         thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
 
         return {
index 0b7886840fbced3d9fa6fb219050f40ac709c080..fec493046861801e3e195d533704ed5f48d3dd7f 100644 (file)
@@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor):
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
-        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
         if webpage.find('"encrypted":true') != -1:
             password = compat_urllib_parse_unquote_plus(
                 self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
index d1ab66b3216d5153a5480769fb0723919f3fdb37..3bc84989e2b4d4c842714a1d815070eeb05e45c9 100644 (file)
@@ -1,12 +1,11 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     unified_strdate,
+    js_to_json,
 )
 
 
@@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor):
         video_id = self._match_id(url)
 
         playerdata = self._download_webpage(
-            'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id,
+            'http://player.screenwavemedia.com/player.php?id=%s' % video_id,
             video_id, 'Downloading player webpage')
 
         vidtitle = self._search_regex(
             r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
-        vidurl = self._search_regex(
-            r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
-
-        videolist_url = None
-
-        mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
-        if mobj:
-            videoserver = mobj.group('videoserver')
-            mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
-            vidid = mobj.group('vidid') if mobj else video_id
-            videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
-        else:
-            mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
-            if mobj:
-                videolist_url = mobj.group('smil')
-
-        if videolist_url:
-            videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
-            formats = []
-            baseurl = vidurl[:vidurl.rfind('/') + 1]
-            for video in videolist.findall('.//video'):
-                src = video.get('src')
-                if not src:
-                    continue
-                file_ = src.partition(':')[-1]
-                width = int_or_none(video.get('width'))
-                height = int_or_none(video.get('height'))
-                bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
-                format = {
-                    'url': baseurl + file_,
-                    'format_id': src.rpartition('.')[0].rpartition('_')[-1],
-                }
-                if width or height:
-                    format.update({
-                        'tbr': bitrate,
-                        'width': width,
-                        'height': height,
-                    })
-                else:
-                    format.update({
-                        'abr': bitrate,
-                        'vcodec': 'none',
-                    })
-                formats.append(format)
-        else:
-            formats = [{
-                'url': vidurl,
-            }]
+
+        playerconfig = self._download_webpage(
+            'http://player.screenwavemedia.com/player.js',
+            video_id, 'Downloading playerconfig webpage')
+
+        videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver')
+
+        sources = self._parse_json(
+            js_to_json(
+                self._search_regex(
+                    r"sources\s*:\s*(\[[^\]]+?\])", playerconfig,
+                    'sources',
+                ).replace(
+                    "' + thisObj.options.videoserver + '",
+                    videoserver
+                ).replace(
+                    "' + playerVidId + '",
+                    video_id
+                )
+            ),
+            video_id
+        )
+
+        formats = []
+        for source in sources:
+            if source['type'] == 'hls':
+                formats.extend(self._extract_m3u8_formats(source['file'], video_id))
+            else:
+                format_label = source.get('label')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', format_label, 'height', default=None))
+                formats.append({
+                    'url': source['file'],
+                    'format': format_label,
+                    'ext': source.get('type'),
+                    'height': height,
+                })
         self._sort_formats(formats)
 
         return {