995dae062e552e98ddb28028e7e55a65a1c580b0
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vbox7 import Vbox7IE
59 from .extractor.vimeo import VimeoIE
60 from .extractor.worldstarhiphop import WorldStarHipHopIE
61 from .extractor.xnxx import XNXXIE
62 from .extractor.xvideos import XVideosIE
63 from .extractor.yahoo import YahooIE, YahooSearchIE
64 from .extractor.youjizz import YouJizzIE
65 from .extractor.youku import YoukuIE
66 from .extractor.youporn import YouPornIE
67 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
68 from .extractor.zdf import ZDFIE
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Fetch the video page for *url* and return a one-element info list.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media link is read from the page's <source ... type="video/mp4"> tag.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')

        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]
131         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list describing the Ina.fr video at *url*.

        The metadata is read from the player's MRSS notice for the video id
        rather than from the HTML page itself.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Same failure mode as the sibling extractors, instead of an
            # AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so that only mp4.ina.fr matches.
        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # The title is wrapped in a CDATA section inside <title>.
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
158
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list describing the Howcast video at *url*."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page URL is rebuilt from the id instead of reusing the input URL.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Description and thumbnail are requested with fatal=False.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)

        thumb = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
        }
        return [info]
192
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list describing the Vine video at *url*."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested over https, using only the video id.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # The optional second group swallows a query string after the image URL.
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)

        # DOTALL lets ".*?" span the lines between the user div and its <h2>.
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'uploader': author,
        }
        return [info]
226
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list describing the Flickr video at *url*.

        Three downloads are chained: the photo page (for the secret and the
        metadata), a first XML document (for the node id) and a second XML
        document (for the stream location).

        Raises ExtractorError when the stream element is missing from the
        second document.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = ''.join(['http://www.flickr.com/photos/', uploader_id, '/', video_id])
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the photo id plus secret yield the node id.
        first_url = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ])
        first_xml = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 2: the node id plus secret yield the playlist with the stream.
        second_url = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ])
        second_xml = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH part is HTML-unescaped before being appended to APP.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        thumb = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
            'uploader_id': uploader_id,
        }
        return [info]
275
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list describing the Teamcoco video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL only carries a title slug; the numeric id is in the page.
        slug = match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location comes from a per-video XML document.
        xml_doc = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            xml_doc, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [info]
314
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it matches only a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list describing the xHamster video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or when
        the media location cannot be found in the page. A missing upload
        date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id only; the title slug is left empty.
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        if not server:
            # With no server given, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = server + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it is not visible anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate the year, month and day groups into YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
366
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list describing the Hypem track at *url*.

        Two requests are made: the track page, whose embedded JSON holds the
        track key, and a "serve" endpoint that returns the final media URL.

        Raises ExtractorError when *url* does not match _VALID_URL, when
        either JSON payload cannot be parsed, or when the track list is
        missing or empty.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The Set-Cookie value is replayed on the second request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # A missing or empty track list is reported as an extractor error
        # rather than leaking a raw KeyError/IndexError.
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            raise ExtractorError(u'Hypemachine contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty body must be bytes: Python 3's urllib rejects a text
        # string as POST data, and b"" is the same object type on Python 2.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
416
417
418
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes are listed in matching priority order and instantiated below.
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
488
def get_info_extractor(ie_name):
    """Look up the extractor class named ie_name + 'IE' in this module's globals.

    Raises KeyError when no such name is defined.
    """
    class_name = ie_name + 'IE'
    module_scope = globals()
    return module_scope[class_name]