c319545c75d1f9da7b2f51ab9e63cfd616e46855
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vbox7 import Vbox7IE
59 from .extractor.vimeo import VimeoIE
60 from .extractor.vine import VineIE
61 from .extractor.worldstarhiphop import WorldStarHipHopIE
62 from .extractor.xnxx import XNXXIE
63 from .extractor.xvideos import XVideosIE
64 from .extractor.yahoo import YahooIE, YahooSearchIE
65 from .extractor.youjizz import YouJizzIE
66 from .extractor.youku import YoukuIE
67 from .extractor.youporn import YouPornIE
68 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
69 from .extractor.zdf import ZDFIE
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Build the info dictionary for a single redtube video page.

        The numeric id is taken from the URL, the page is downloaded, and
        the media URL and title are pulled out of the HTML with regular
        expressions.

        Returns a one-element list holding a dict with the keys
        'id', 'url', 'ext' and 'title'.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media link is read from the <source> tag declared as video/mp4.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            # The extension is fixed: only the video/mp4 source is looked up.
            'ext': 'mp4',
            'title': title,
        }
        return [info]
132         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Build the info dictionary for a single Ina.fr video.

        The video id is taken from the URL and used to download the
        matching MRSS notice from player.ina.fr; the media URL and the
        title are then read from that feed.

        Returns a one-element list holding a dict with the keys
        'id', 'url', 'ext' and 'title'.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a readable error instead of an AttributeError on None,
        # in line with the other extractors of this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        mrss = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots of the host name are escaped so they only match a literal ".".
        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            mrss, u'video URL')

        # The title is wrapped in a CDATA section inside the feed.
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
159
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Build the info dictionary for a single Howcast video.

        The numeric id is taken from the URL, the canonical video page is
        downloaded, and the media URL plus metadata are read from it.
        Description and thumbnail are looked up with fatal=False, i.e.
        they are treated as optional.

        Returns a one-element list holding a dict with the keys
        'id', 'url', 'ext', 'title', 'description' and 'thumbnail'.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a readable error instead of an AttributeError on None,
        # in line with the other extractors of this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Always fetch the canonical page, whatever trailing part the URL had.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        # Same quoting alternatives as the title/description patterns. The
        # negated character classes keep the match inside one attribute value
        # instead of letting it start at an earlier <meta content='...'> tag.
        thumbnail = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
193
194
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Build the info dictionary for a single Flickr video.

        Three downloads are chained:
          1. the photo page, which yields the 'photo_secret' value;
          2. a first XML document, which yields the playlist node id;
          3. a second XML document (the playlist), which yields the
             stream application and path that make up the media URL.
        Title, description and thumbnail come from the Open Graph meta
        tags of the photo page; description and thumbnail are looked up
        with fatal=False, i.e. they are treated as optional.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'description', 'thumbnail' and
        'uploader_id'.
        Raises ExtractorError when the URL does not match _VALID_URL or
        when the playlist contains no stream entry.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a readable error instead of an AttributeError on None,
        # in line with the other extractors of this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret is required by both follow-up requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped; the application part is used as is.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
243
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Build the info dictionary for a single teamcoco.com video.

        The video page is downloaded first to find the numeric video id
        and the Open Graph metadata; the media URL is then read from a
        per-video XML document. Thumbnail and description are looked up
        with fatal=False, i.e. they are treated as optional.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'thumbnail' and 'description'.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the download.
        slug = match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"', page, u'thumbnail',
            fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"', page, u'description',
            fatal=False)

        # The media URL is the content of the <file type="high"> element.
        xml_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        xml_doc = self._download_webpage(xml_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', xml_doc, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [info]
282
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "." is accepted.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Build the info dictionary for a single xHamster video.

        The numeric id is taken from the URL and the video page is
        downloaded; the media URL, title, upload date, uploader and
        thumbnail are read from that page with regular expressions.
        A missing upload date only produces a warning, a missing uploader
        falls back to u'anonymous', and the thumbnail is looked up with
        fatal=False, i.e. it is treated as optional.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'upload_date', 'uploader_id' and
        'thumbnail'.
        Raises ExtractorError when the URL does not match _VALID_URL or
        when the page holds no media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a readable error instead of an AttributeError on None,
        # in line with the other extractors of this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id only; the title slug of the URL is dropped.
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        if not server:
            # Without a server, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = server + '/key=' + mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it could not be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
334
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Build the info dictionary for a single Hype Machine track.

        The track page is requested with two extra query parameters
        ('ax' and a timestamp); its 'Set-Cookie' header is kept and sent
        again with the second request. The page embeds a JSON track list
        whose first entry provides key, id, artist and song title; key
        and id are then used to request the JSON document that holds the
        final media URL.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title' and 'artist'.
        Raises ExtractorError when the URL does not match _VALID_URL,
        when either JSON document cannot be parsed, or when it lacks the
        expected track / URL entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        # Extend an existing query string instead of starting a second one.
        separator = '&' if '?' in url else '?'
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # Valid JSON may still lack the track list or carry an empty one;
        # report that as an extraction error rather than a raw lookup error.
        try:
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine contained no usable track data.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        # Replay the cookie received with the track page.
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            final_url = song_data[u"url"]
        except (KeyError, TypeError):
            raise ExtractorError(u'Hypemachine metadata contained no media URL.')

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
384
385
386
def gen_extractors():
    """Instantiate every supported extractor and return the instances as a list.

    The position in the list is significant: the first extractor that
    matches a URL is the one that handles it.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
456
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    'IE' is appended to ie_name and the result is used as a key into the
    module's global namespace; the object stored there is returned as is.
    An unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]