Merge remote-tracking branch 'Dineshs91/f4m-2.0'
[youtube-dl] / youtube_dl / extractor / archiveorg.py
index 61ce4469a05dd3cdf9bddbecf8c82119c40b5c3f..9fc35a42b8612d828ccc3ae43c9e4f74782f5352 100644 (file)
@@ -1,55 +1,61 @@
-import json
-import re
+from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    unified_strdate,
-)
+from ..utils import unified_strdate
 
 
 class ArchiveOrgIE(InfoExtractor):
     IE_NAME = 'archive.org'
     IE_DESC = 'archive.org videos'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
-    _TEST = {
-        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
-        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
-        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
-        u'info_dict': {
-            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
-            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
-            u"upload_date": u"19681210",
-            u"uploader": u"SRI International"
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _TESTS = [{
+        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'md5': '8af1d4cf447933ed3c7f4871162602db',
+        'info_dict': {
+            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+            'ext': 'ogv',
+            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+            'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+            'upload_date': '19681210',
+            'uploader': 'SRI International'
         }
-    }
-
+    }, {
+        'url': 'https://archive.org/details/Cops1922',
+        'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+        'info_dict': {
+            'id': 'Cops1922',
+            'ext': 'ogv',
+            'title': 'Buster Keaton\'s "Cops" (1922)',
+            'description': 'md5:70f72ee70882f713d4578725461ffcc3',
+        }
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+
+        json_url = url + ('?' if '?' in url else '&') + 'output=json'
+        data = self._download_json(json_url, video_id)
 
-        json_url = url + (u'?' if u'?' in url else '&') + u'output=json'
-        json_data = self._download_webpage(json_url, video_id)
-        data = json.loads(json_data)
+        def get_optional(data_dict, field):
+            return data_dict['metadata'].get(field, [None])[0]
 
-        title = data['metadata']['title'][0]
-        description = data['metadata']['description'][0]
-        uploader = data['metadata']['creator'][0]
-        upload_date = unified_strdate(data['metadata']['date'][0])
+        title = get_optional(data, 'title')
+        description = get_optional(data, 'description')
+        uploader = get_optional(data, 'creator')
+        upload_date = unified_strdate(get_optional(data, 'date'))
 
-        formats = [{
+        formats = [
+            {
                 'format': fdata['format'],
                 'url': 'http://' + data['server'] + data['dir'] + fn,
                 'file_size': int(fdata['size']),
             }
-            for fn,fdata in data['files'].items()
+            for fn, fdata in data['files'].items()
             if 'Video' in fdata['format']]
-        formats.sort(key=lambda fdata: fdata['file_size'])
-        for f in formats:
-            f['ext'] = determine_ext(f['url'])
 
-        info = {
+        self._sort_formats(formats)
+
+        return {
             '_type': 'video',
             'id': video_id,
             'title': title,
@@ -57,12 +63,5 @@ class ArchiveOrgIE(InfoExtractor):
             'description': description,
             'uploader': uploader,
             'upload_date': upload_date,
+            'thumbnail': data.get('misc', {}).get('image'),
         }
-        thumbnail = data.get('misc', {}).get('image')
-        if thumbnail:
-            info['thumbnail'] = thumbnail
-
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info