4bb4aa84a4e3d102822acd864309824d810b409c
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.hypem import HypemIE
38 from .extractor.ina import InaIE
39 from .extractor.infoq import InfoQIE
40 from .extractor.justintv import JustinTVIE
41 from .extractor.keek import KeekIE
42 from .extractor.liveleak import LiveLeakIE
43 from .extractor.metacafe import MetacafeIE
44 from .extractor.mixcloud import MixcloudIE
45 from .extractor.mtv import MTVIE
46 from .extractor.myspass import MySpassIE
47 from .extractor.myvideo import MyVideoIE
48 from .extractor.nba import NBAIE
49 from .extractor.statigram import StatigramIE
50 from .extractor.photobucket import PhotobucketIE
51 from .extractor.pornotube import PornotubeIE
52 from .extractor.rbmaradio import RBMARadioIE
53 from .extractor.redtube import RedTubeIE
54 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
55 from .extractor.spiegel import SpiegelIE
56 from .extractor.stanfordoc import StanfordOpenClassroomIE
57 from .extractor.steam import SteamIE
58 from .extractor.ted import TEDIE
59 from .extractor.tumblr import TumblrIE
60 from .extractor.ustream import UstreamIE
61 from .extractor.vbox7 import Vbox7IE
62 from .extractor.vimeo import VimeoIE
63 from .extractor.vine import VineIE
64 from .extractor.worldstarhiphop import WorldStarHipHopIE
65 from .extractor.xnxx import XNXXIE
66 from .extractor.xvideos import XVideosIE
67 from .extractor.yahoo import YahooIE, YahooSearchIE
68 from .extractor.youjizz import YouJizzIE
69 from .extractor.youku import YoukuIE
70 from .extractor.youporn import YouPornIE
71 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
72 from .extractor.zdf import ZDFIE
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the metadata of a single Howcast video.

        url is matched against _VALID_URL to obtain the numeric video id.
        The page http://www.howcast.com/videos/<id> is then downloaded and
        scraped for the media URL and the meta tags.

        Returns a one-element list containing the info dictionary with the
        keys id, url, ext, title, description and thumbnail.  description
        and thumbnail are requested with fatal=False; the media URL and the
        title are requested without it.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Always fetch the canonical page, whatever form the input URL had.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        # Accept either quote style, like the title and description patterns
        # above.  Excluding the quote character keeps the capture inside one
        # attribute value; the former lazy '.+?' could start at an earlier
        # <meta content='...'> on the same line and swallow several tags.
        thumbnail = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
142
143
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract the metadata of a single Flickr video.

        The uploader id and the video id are taken from url via _VALID_URL.
        Three documents are then downloaded in turn:

        1. the photo page, which supplies the 'photo_secret' value and the
           og: meta tags;
        2. video_mtl_xml.gne, which supplies the node id;
        3. video_playlist.gne, whose STREAM element supplies the media URL.

        Returns a one-element list containing the info dictionary with the
        keys id, url, ext, title, description, thumbnail and uploader_id.
        description and thumbnail are requested with fatal=False.

        Raises ExtractorError if url does not match _VALID_URL or if the
        playlist document contains no STREAM element.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret is required as a query parameter by both requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # The media URL is APP followed by FULLPATH; only FULLPATH is passed
        # through unescapeHTML.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
192
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com video pages"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the metadata of a single teamcoco.com video.

        The video page at url supplies the numeric video id and the og: meta
        tags; a second document, cvp/2.0/<id>.xml, supplies the media URL.

        Returns a one-element list containing the info dictionary with the
        keys id, url, ext, title, thumbnail and description.  thumbnail and
        description are requested with fatal=False.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug labels the download.
        page_slug = url_match.group('url_title')
        page = self._download_webpage(url, page_slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media URL lives in a separate per-video XML document.
        cvp_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        cvp_xml = self._download_webpage(
            cvp_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        info = {
            'id':          video_id,
            'url':         media_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumb,
            'description': description,
        }
        return [info]
231
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after 'www' is escaped so that only a literal 'www.' prefix is
    # accepted; unescaped it matched any character (e.g. 'wwwXxhamster.com').
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the metadata of a single xHamster video.

        The numeric video id is taken from url via _VALID_URL and the page
        http://xhamster.com/movies/<id>/.html is downloaded and scraped.

        Returns a one-element list containing the info dictionary with the
        keys id, url, ext, title, upload_date, uploader_id and thumbnail.
        upload_date is None (and a warning is reported) when no date is found
        on the page; uploader_id is requested with default u'anonymous';
        thumbnail is requested with fatal=False.

        Raises ExtractorError if url does not match _VALID_URL or if the
        page contains no 'srv'/'file' pair.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        media_file = mobj.group('file')
        if not server:
            # With an empty server, 'file' is itself the URL in quoted form.
            video_url = compat_urllib_parse.unquote(media_file)
        else:
            video_url = server + '/key=' + media_file
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it is not visible anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
283
284
285
286
287
def gen_extractors():
    """Instantiate every supported extractor and return the instances as a list.

    The order is significant: the first extractor that matches a URL is the
    one that handles it, so the classes below keep a fixed order, with
    GenericIE as the final entry.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # One fresh instance per class, in table order.
    return [ie_class() for ie_class in ordered_classes]
357
def get_info_extractor(ie_name):
    """Return the info extractor class whose name is ie_name plus 'IE'.

    The class is looked up in this module's global namespace; a KeyError
    propagates when no global of that name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]