Merge branch 'download-archive'
author Philipp Hagemeister <phihag@phihag.de>
Sun, 6 Oct 2013 14:30:26 +0000 (16:30 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Sun, 6 Oct 2013 14:30:26 +0000 (16:30 +0200)
Conflicts:
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py

14 files changed:
test/helper.py
test/test_age_restriction.py [new file with mode: 0644]
test/test_download.py
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/common.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/pornotube.py
youtube_dl/extractor/viddler.py [new file with mode: 0644]
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py
youtube_dl/version.py

index 8e641e3cb3054adada433b52d0a4f4ee52ad6631..884cf32dc2b8c0246744df57e64b2b60a5fcde94 100644 (file)
@@ -1,3 +1,4 @@
+import errno
 import io
 import json
 import os.path
@@ -22,18 +23,33 @@ PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "para
 with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
     parameters = json.load(pf)
 
+
+def try_rm(filename):
+    """ Remove a file if it exists """
+    try:
+        os.remove(filename)
+    except OSError as ose:
+        if ose.errno != errno.ENOENT:
+            raise
+
+
 class FakeYDL(YoutubeDL):
     def __init__(self):
-        self.result = []
         # Different instances of the downloader can't share the same dictionary
         # some test set the "sublang" parameter, which would break the md5 checks.
-        self.params = dict(parameters)
-    def to_screen(self, s):
+        params = dict(parameters)
+        super(FakeYDL, self).__init__(params)
+        self.result = []
+        
+    def to_screen(self, s, skip_eol=None):
         print(s)
+
     def trouble(self, s, tb=None):
         raise Exception(s)
+
     def download(self, x):
         self.result.append(x)
+
     def expect_warning(self, regex):
         # Silence an expected warning matching a regex
         old_report_warning = self.report_warning
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py
new file mode 100644 (file)
index 0000000..943f9a3
--- /dev/null
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+import sys
+import unittest
+
+# Allow direct execution
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl import YoutubeDL
+from helper import try_rm
+
+
+def _download_restricted(url, filename, age):
+    """ Returns true iff the file has been downloaded """
+
+    params = {
+        'age_limit': age,
+        'skip_download': True,
+        'writeinfojson': True,
+        "outtmpl": "%(id)s.%(ext)s",
+    }
+    ydl = YoutubeDL(params)
+    ydl.add_default_info_extractors()
+    json_filename = filename + '.info.json'
+    try_rm(json_filename)
+    ydl.download([url])
+    res = os.path.exists(json_filename)
+    try_rm(json_filename)
+    return res
+
+
+class TestAgeRestriction(unittest.TestCase):
+    def _assert_restricted(self, url, filename, age, old_age=None):
+        self.assertTrue(_download_restricted(url, filename, old_age))
+        self.assertFalse(_download_restricted(url, filename, age))
+
+    def test_youtube(self):
+        self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10)
+
+    def test_youporn(self):
+        self._assert_restricted(
+            'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+            '505835.mp4', 2, old_age=25)
+
+    def test_pornotube(self):
+        self._assert_restricted(
+            'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
+            '1689755.flv', 13)
+
+
+if __name__ == '__main__':
+    unittest.main()
index 23a66254d86ed2a68ee3ea54339838fda7d5dc71..23d3853c4d10706c379eb5ee2ed69b46c33d7584 100644 (file)
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 
-import errno
 import hashlib
 import io
 import os
@@ -28,14 +27,6 @@ opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, You
 compat_urllib_request.install_opener(opener)
 socket.setdefaulttimeout(10)
 
-def _try_rm(filename):
-    """ Remove a file if it exists """
-    try:
-        os.remove(filename)
-    except OSError as ose:
-        if ose.errno != errno.ENOENT:
-            raise
-
 md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
 
 class YoutubeDL(youtube_dl.YoutubeDL):
@@ -54,7 +45,7 @@ def _file_md5(fn):
     with open(fn, 'rb') as f:
         return hashlib.md5(f.read()).hexdigest()
 
-from helper import get_testcases
+from helper import get_testcases, try_rm
 defs = get_testcases()
 
 with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
@@ -97,9 +88,9 @@ def generator(test_case):
 
         test_cases = test_case.get('playlist', [test_case])
         for tc in test_cases:
-            _try_rm(tc['file'])
-            _try_rm(tc['file'] + '.part')
-            _try_rm(tc['file'] + '.info.json')
+            try_rm(tc['file'])
+            try_rm(tc['file'] + '.part')
+            try_rm(tc['file'] + '.info.json')
         try:
             for retry in range(1, RETRIES + 1):
                 try:
@@ -145,9 +136,9 @@ def generator(test_case):
                     self.assertTrue(key in info_dict.keys() and info_dict[key])
         finally:
             for tc in test_cases:
-                _try_rm(tc['file'])
-                _try_rm(tc['file'] + '.part')
-                _try_rm(tc['file'] + '.info.json')
+                try_rm(tc['file'])
+                try_rm(tc['file'] + '.part')
+                try_rm(tc['file'] + '.info.json')
 
     return test_template
 
index 856e9ac929eb3512bedf6d0daf3f10f9558e25ad..073a3837c2c233535a2b5207f6ee4605952f6924 100644 (file)
@@ -85,6 +85,8 @@ class YoutubeDL(object):
     cachedir:          Location of the cache files in the filesystem.
                        None to disable filesystem cache.
     noplaylist:        Download single video instead of a playlist if in doubt.
+    age_limit:         An integer representing the user's age in years.
+                       Unsuitable videos for the given age are skipped.
     downloadarchive:   File name of a file where all downloads are recorded.
                        Videos already present in the file are not downloaded
                        again.
@@ -313,6 +315,10 @@ class YoutubeDL(object):
             dateRange = self.params.get('daterange', DateRange())
             if date not in dateRange:
                 return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+        age_limit = self.params.get('age_limit')
+        if age_limit is not None:
+            if age_limit < info_dict.get('age_limit', 0):
+                return u'Skipping "' + title + '" because it is age restricted'
         if self.in_download_archive(info_dict):
             return (u'%(title)s has already been recorded in archive'
                     % info_dict)
index a680d7c55757a17f05f21a4f418990877b377503..ba5206387a8b24e38a5594e3b411e6a6095f797a 100644 (file)
@@ -188,6 +188,9 @@ def parseOpts(overrideArguments=None):
     selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None)
     selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None)
     selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False)
+    selection.add_option('--age-limit', metavar='YEARS', dest='age_limit',
+                         help='download only videos suitable for the given age',
+                         default=None, type=int)
     selection.add_option('--download-archive', metavar='FILE',
                          dest='download_archive',
                          help='Download only videos not present in the archive file. Record all downloaded videos in it.')
@@ -634,6 +637,7 @@ def _real_main(argv=None):
         'daterange': date,
         'cachedir': opts.cachedir,
         'youtube_print_sig_code': opts.youtube_print_sig_code,
+        'age_limit': opts.age_limit,
         'download_archive': opts.download_archive,
         })
 
index d1b7e5f991fa63664233670100a7ef54861c58c9..2b054e1c9498d1786f4652e6f7b72c07bdd0f0c8 100644 (file)
@@ -117,6 +117,7 @@ from .veehd import VeeHDIE
 from .veoh import VeohIE
 from .vevo import VevoIE
 from .vice import ViceIE
+from .viddler import ViddlerIE
 from .videofyme import VideofyMeIE
 from .vimeo import VimeoIE, VimeoChannelIE
 from .vine import VineIE
index 69cdcdc1b5b75d1cac5733b34565f087c9dcddec..2a5a85dc67b4f7a57d04d4f21c1608aa2c47f7f3 100644 (file)
@@ -54,6 +54,7 @@ class InfoExtractor(object):
     view_count:     How many users have watched the video on the platform.
     urlhandle:      [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen
+    age_limit:      Age restriction for the video, as an integer (years)
     formats:        A list of dictionaries for each format available, it must
                     be ordered from worst to best quality. Potential fields:
                     * url       Mandatory. The URL of the video file
@@ -318,6 +319,15 @@ class InfoExtractor(object):
                                         self._og_regex('video')],
                                        html, name, **kargs)
 
+    def _rta_search(self, html):
+        # See http://www.rtalabel.org/index.php?content=howtofaq#single
+        if re.search(r'(?ix)<meta\s+name="rating"\s+'
+                     r'     content="RTA-5042-1996-1400-1577-RTA"',
+                     html):
+            return 18
+        return 0
+
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
index 764070635160d96487baa53b4370b28e8a957316..7060c6f9258c28c9dcb18681c62882f52715edf9 100644 (file)
@@ -117,7 +117,7 @@ class GenericIE(InfoExtractor):
         except ValueError:
             # since this is the last-resort InfoExtractor, if
             # this error is thrown, it'll be thrown here
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError(u'Failed to download URL: %s' % url)
 
         self.report_extraction(video_id)
         # Look for BrightCove:
@@ -149,12 +149,12 @@ class GenericIE(InfoExtractor):
             # HTML5 video
             mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
         if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError(u'Unsupported URL: %s' % url)
 
         # It's possible that one of the regexes
         # matched, but returned an empty group:
         if mobj.group(1) is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError(u'Did not find a valid video URL at %s' % url)
 
         video_url = mobj.group(1)
         video_url = compat_urlparse.urljoin(url, video_url)
index add76a11e5f2c0c17af76b71db6e8bd07adc6cd6..5d770ec285c3d1e3dcad04cfe49ca7780a9dd2b4 100644 (file)
@@ -38,6 +38,7 @@ class PornotubeIE(InfoExtractor):
         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
         if upload_date: upload_date = unified_strdate(upload_date)
+        age_limit = self._rta_search(webpage)
 
         info = {'id': video_id,
                 'url': video_url,
@@ -45,6 +46,7 @@ class PornotubeIE(InfoExtractor):
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
-                'format': 'flv'}
+                'format': 'flv',
+                'age_limit': age_limit}
 
         return [info]
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
new file mode 100644 (file)
index 0000000..12c84a9
--- /dev/null
@@ -0,0 +1,64 @@
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+
+class ViddlerIE(InfoExtractor):
+    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)'
+    _TEST = {
+        u"url": u"http://www.viddler.com/v/43903784",
+        u'file': u'43903784.mp4',
+        u'md5': u'fbbaedf7813e514eb7ca30410f439ac9',
+        u'info_dict': {
+            u"title": u"Video Made Easy",
+            u"uploader": u"viddler",
+            u"duration": 100.89,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        embed_url = mobj.group('domain') + u'/embed/' + video_id
+        webpage = self._download_webpage(embed_url, video_id)
+
+        video_sources_code = self._search_regex(
+            r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs')
+        video_sources = json.loads(video_sources_code.replace("'", '"'))
+
+        formats = [{
+            'url': video_url,
+            'format': format_id,
+        } for video_url, format_id in video_sources.items()]
+
+        title = self._html_search_regex(
+            r"title\s*:\s*'([^']*)'", webpage, u'title')
+        uploader = self._html_search_regex(
+            r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False)
+        duration_s = self._html_search_regex(
+            r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False)
+        duration = float(duration_s) if duration_s else None
+        thumbnail = self._html_search_regex(
+            r"thumbnail\s*:\s*'([^']*)'",
+            webpage, u'thumbnail', fatal=False)
+
+        info = {
+            '_type': 'video',
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+            'formats': formats,
+        }
+
+        # TODO: Remove when #980 has been merged
+        info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
+        info.update(info['formats'][-1])
+
+        return info
index c85fd4b5af0ccdd3f259bd403ddd4311f2de5fdb..b1f93dd1bb90d964916394d88d83aaaf153ba15b 100644 (file)
@@ -51,6 +51,7 @@ class YouPornIE(InfoExtractor):
         req = compat_urllib_request.Request(url)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
+        age_limit = self._rta_search(webpage)
 
         # Get JSON parameters
         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
@@ -115,7 +116,8 @@ class YouPornIE(InfoExtractor):
                 'ext': extension,
                 'format': format,
                 'thumbnail': thumbnail,
-                'description': video_description
+                'description': video_description,
+                'age_limit': age_limit,
             })
 
         if self._downloader.params.get('listformats', None):
index 1101011ea38aedc4d2a46aecb3c357861e441929..b02ae25727f2843be3952670e8eba617ff23b749 100644 (file)
@@ -1495,7 +1495,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'description':  video_description,
                 'player_url':   player_url,
                 'subtitles':    video_subtitles,
-                'duration':     video_duration
+                'duration':     video_duration,
+                'age_limit':    18 if age_gate else 0,
             })
         return results
 
index a463049a4d189f3d2b065f7a5955c5c079a28ca5..de26547621b2172b8bea45083606c407882346d2 100644 (file)
@@ -175,7 +175,7 @@ def compat_ord(c):
 compiled_regex_type = type(re.compile(''))
 
 std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Encoding': 'gzip, deflate',
index e773e82dae44d0b8099aeead68b9ec3103b91294..08eda219768c5f58078745a94141264d28fb4b8f 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2013.10.04'
+__version__ = '2013.10.06'