[redtube] move into own file
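For context, the RedTubeIE class now lives in its own module under the extractor package and is only imported here. Below is a minimal, illustrative-only sketch of the general shape such a file takes, following the extractor pattern used throughout this module; the _VALID_URL and page-scraping regexes are placeholders, not the contents of the actual new file.

youtube_dl/extractor/redtube.py (illustrative sketch)

import re

from .common import InfoExtractor


class RedTubeIE(InfoExtractor):
    """Illustrative sketch of a single-site extractor module."""
    # Placeholder pattern; the real module defines its own _VALID_URL.
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Placeholder regexes; the real extractor scrapes the page with its own patterns.
        video_url = self._html_search_regex(r'<source src="(.+?)"', webpage, u'video URL')
        video_title = self._html_search_regex(r'<title>(.+?)</title>', webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]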
youtube_dl/InfoExtractors.py
import base64
import datetime
import itertools
import json
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator
import hashlib
import binascii
import urllib

from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor

from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bandcamp import BandcampIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.collegehumor import CollegeHumorIE
from .extractor.dailymotion import DailymotionIE
from .extractor.depositfiles import DepositFilesIE
from .extractor.eighttracks import EightTracksIE
from .extractor.escapist import EscapistIE
from .extractor.facebook import FacebookIE
from .extractor.funnyordie import FunnyOrDieIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.infoq import InfoQIE
from .extractor.justintv import JustinTVIE
from .extractor.keek import KeekIE
from .extractor.liveleak import LiveLeakIE
from .extractor.metacafe import MetacafeIE
from .extractor.mixcloud import MixcloudIE
from .extractor.mtv import MTVIE
from .extractor.myspass import MySpassIE
from .extractor.myvideo import MyVideoIE
from .extractor.nba import NBAIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.pornotube import PornotubeIE
from .extractor.rbmaradio import RBMARadioIE
from .extractor.redtube import RedTubeIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.spiegel import SpiegelIE
from .extractor.stanfordoc import StanfordOpenClassroomIE
from .extractor.steam import SteamIE
from .extractor.ted import TEDIE
from .extractor.tumblr import TumblrIE
from .extractor.ustream import UstreamIE
from .extractor.vbox7 import Vbox7IE
from .extractor.vimeo import VimeoIE
from .extractor.vine import VineIE
from .extractor.worldstarhiphop import WorldStarHipHopIE
from .extractor.xnxx import XNXXIE
from .extractor.xvideos import XVideosIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youjizz import YouJizzIE
from .extractor.youku import YoukuIE
from .extractor.youporn import YouPornIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE


class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]

class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]


class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]

class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]

class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not visible anywhere in the current UI, so it is not extracted:
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]

class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]


def gen_extractors():
    """ Return a list containing an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        GenericIE()
    ]

def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name + 'IE']
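
Usage note: gen_extractors() returns the extractors in priority order, and the downloader walks that list, handing the URL to the first extractor whose suitable() check matches. A minimal sketch of that dispatch logic, assuming InfoExtractor.suitable(url) tests the URL against _VALID_URL (this helper is illustrative and not part of the module):

def _example_find_extractor(url):
    # Illustrative sketch only: return the first extractor that declares
    # itself suitable for the URL, mirroring how the downloader picks one.
    # Returns None if no extractor matches.
    for ie in gen_extractors():
        if ie.suitable(url):
            return ie
    return None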