d5167fef966c6c42b66379fd6f0ebccb1ea6e020
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.ina import InaIE
38 from .extractor.infoq import InfoQIE
39 from .extractor.justintv import JustinTVIE
40 from .extractor.keek import KeekIE
41 from .extractor.liveleak import LiveLeakIE
42 from .extractor.metacafe import MetacafeIE
43 from .extractor.mixcloud import MixcloudIE
44 from .extractor.mtv import MTVIE
45 from .extractor.myspass import MySpassIE
46 from .extractor.myvideo import MyVideoIE
47 from .extractor.nba import NBAIE
48 from .extractor.statigram import StatigramIE
49 from .extractor.photobucket import PhotobucketIE
50 from .extractor.pornotube import PornotubeIE
51 from .extractor.rbmaradio import RBMARadioIE
52 from .extractor.redtube import RedTubeIE
53 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
54 from .extractor.spiegel import SpiegelIE
55 from .extractor.stanfordoc import StanfordOpenClassroomIE
56 from .extractor.steam import SteamIE
57 from .extractor.ted import TEDIE
58 from .extractor.tumblr import TumblrIE
59 from .extractor.ustream import UstreamIE
60 from .extractor.vbox7 import Vbox7IE
61 from .extractor.vimeo import VimeoIE
62 from .extractor.vine import VineIE
63 from .extractor.worldstarhiphop import WorldStarHipHopIE
64 from .extractor.xnxx import XNXXIE
65 from .extractor.xvideos import XVideosIE
66 from .extractor.yahoo import YahooIE, YahooSearchIE
67 from .extractor.youjizz import YouJizzIE
68 from .extractor.youku import YoukuIE
69 from .extractor.youporn import YouPornIE
70 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
71 from .extractor.zdf import ZDFIE
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
class HowcastIE(InfoExtractor):
    """Information extractor for videos on howcast.com.

    Accepts URLs of the form ``howcast.com/videos/<numeric id>`` (scheme and
    ``www.`` prefix optional).
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The numeric id is taken from the URL, the canonical video page is
        fetched, and the media URL plus metadata are pulled from the page
        with regular expressions. Description and thumbnail are requested
        with ``fatal=False``; the media URL and title are not.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page rather than the URL as given.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The <meta> content attribute may be single- or double-quoted.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)

        thumbnail_url = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
        }
        return [info]
141
142
class FlickrIE(InfoExtractor):
    """Information extractor for videos hosted on Flickr photo pages.

    Accepts URLs of the form ``flickr.com/photos/<uploader id>/<numeric id>``
    (scheme and ``www.`` prefix optional, anything may follow the id).
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Three documents are fetched in sequence: the photo page (for the
        photo secret and the metadata), a first XML document (for the node
        id) and a playlist XML document (for the stream location).

        Raises ExtractorError when the playlist contains no STREAM element
        with APP and FULLPATH attributes.
        """
        url_match = re.match(self._VALID_URL, url)
        video_uploader_id = url_match.group('uploader_id')
        video_id = url_match.group('id')

        page_url = ''.join((
            'http://www.flickr.com/photos/', video_uploader_id, '/', video_id))
        webpage = self._download_webpage(page_url, video_id)

        # The secret is required by both follow-up requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = ''.join((
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ))
        first_xml = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = ''.join((
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ))
        second_xml = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is APP followed by the HTML-unescaped FULLPATH.
        app, full_path = stream.group(1), stream.group(2)
        video_url = app + unescapeHTML(full_path)

        # The <meta> content attribute may be single- or double-quoted.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
            'uploader_id': video_uploader_id,
        }
        return [info]
191
class TeamcocoIE(InfoExtractor):
    """Information extractor for videos on teamcoco.com.

    Accepts URLs of the form ``http://teamcoco.com/video/<title slug>``.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The video page supplies the numeric id and the metadata; a second
        XML document, addressed by that id, supplies the media URL.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the video.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        xml_data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        # Only the <file type="high"> entry is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', xml_data, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [info]
230
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster

    Accepts URLs of the form ``xhamster.com/movies/<numeric id>/<name>.html``
    (``http://`` scheme and ``www.`` prefix optional).
    """
    # The dot after "www" is escaped so that only a literal "www." prefix is
    # accepted; an unescaped dot would match any character in that position.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The video page is fetched by id and the media location, title,
        upload date, uploader and thumbnail are extracted from it with
        regular expressions.

        Raises ExtractorError when the server/file pair describing the
        media cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # With an empty server, 'file' holds the URL-quoted media URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it could not be found anywhere in
        # the site's UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Joined as YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
282
class HypemIE(InfoExtractor):
    """Information Extractor for hypem

    Accepts URLs of the form ``hypem.com/track/<id>/<name>`` (``http://``
    scheme and ``www.`` prefix optional).
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The track page is requested with extra ``ax``/``ts`` query
        parameters, the embedded JSON track list is parsed, and the first
        track's id and key are used to request the JSON document that
        carries the final media URL. The ``Set-Cookie`` header of the first
        response is sent back as the cookie of the second request.

        Raises ExtractorError when the URL does not match, when either JSON
        document cannot be parsed, or when an expected field is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        # Extend an existing query string instead of starting a second one.
        separator = '&' if '?' in url else '?'
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # Well-formed JSON may still lack the fields needed below (no
        # 'tracks' entry, an empty list, a missing key); report that as an
        # extraction failure instead of leaking KeyError/IndexError/TypeError.
        try:
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine did not provide the expected track data.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            final_url = song_data[u"url"]
        except (KeyError, TypeError):
            raise ExtractorError(u'Hypemachine did not provide a media URL.')

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
332
333
334
def gen_extractors():
    """Build one fresh instance of every supported extractor.

    Returns a list. The order does matter: the first extractor matched is
    the one handling the URL.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        # Listed last, after every site-specific extractor.
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
404
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    'IE' is appended to ie_name and the resulting name is looked up in this
    module's global namespace; a KeyError propagates if no such name exists.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]