[DHM] Add new extractor
[youtube-dl] / youtube_dl / extractor / dhm.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5
6 import urllib2
7 import xml.etree.ElementTree as ET
8 import re
9
10
class DHMIE(InfoExtractor):
    """Extractor for film pages under dhm.de/filmarchiv/.

    The film page is searched for a ``dc:title="..."`` attribute (the title)
    and a ``file: '...'`` entry (the URL of an XML playlist).  The playlist
    is then downloaded to obtain the media URL and the thumbnail.
    """

    # The last non-empty path segment is captured as ``id`` so that
    # ``_match_id`` yields a usable slug (a trailing lazy ``.*?`` would always
    # capture the empty string).  Both http/https and an optional ``www.``
    # prefix are accepted.
    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/?#]+/)*(?P<id>[^/?#]+)'

    _TEST = {
        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
        'md5': '11c475f670209bf6acca0b2b7ef51827',
        'info_dict': {
            'id': 'marshallwg',
            'ext': 'flv',
            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
            'thumbnail': 'http://www.dhm.de/filmarchiv/video/mpworkwg.jpg',
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the film page at *url*.

        The result contains ``id``, ``display_id``, ``title``, ``url`` and
        ``thumbnail``.  ``id`` is the base name of the ``.flv`` file when the
        media URL has the form ``video/<name>.flv``; otherwise the URL slug
        is used so the id is never empty.
        """
        # Slug from the page URL; used to label downloads in log output and
        # as the fallback id.
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'dc:title="(.*?)"', webpage, 'title')

        playlist_url = self._html_search_regex(
            r"file\s*:\s*'([^']+)'", webpage, 'playlist URL')

        # Fetch and parse through the extractor's own helper instead of a raw
        # urllib2 call: it goes through the downloader's configured opener
        # and works on both Python 2 and Python 3.
        playlist = self._download_xml(playlist_url, display_id)

        # NOTE(review): positional layout kept from the original code --
        # first track of the first list, child 0 = media location,
        # child 2 = image.  Confirm against the playlist schema.
        track = playlist[0][0]
        video_url = track[0].text
        # The thumbnail is optional metadata; do not fail when it is absent.
        thumbnail = track[2].text if len(track) > 2 else None

        mobj = re.search(r'video/(.+?)\.flv', video_url)
        video_id = mobj.group(1) if mobj else display_id

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
        }