[Bandcamp] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vimeo import VimeoIE
59 from .extractor.worldstarhiphop import WorldStarHipHopIE
60 from .extractor.xnxx import XNXXIE
61 from .extractor.xvideos import XVideosIE
62 from .extractor.yahoo import YahooIE, YahooSearchIE
63 from .extractor.youjizz import YouJizzIE
64 from .extractor.youku import YoukuIE
65 from .extractor.youporn import YouPornIE
66 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
67 from .extractor.zdf import ZDFIE
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Fetch the page at *url* and return a one-element list with the video info.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media link is read from the <source ... type="video/mp4"> tag.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]
130         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the Ina.fr video at *url*.

        The media URL and title are read from the player's MRSS document
        rather than from the HTML page itself.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError below
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so only mp4.ina.fr is accepted
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
157
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the Howcast video at *url*.

        The page is always re-fetched from a canonical URL built from the
        numeric id. Description and thumbnail are optional (non-fatal).

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError below
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The <meta> content may be delimited by either quote character
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
191
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the Vine video at *url*.

        All fields are read from the page's markup; thumbnail and uploader
        are optional (non-fatal).

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError below
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group keeps any query string out of the first group
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        # DOTALL: the <h2> may be on a later line than the enclosing <div>
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
225
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the Flickr video at *url*.

        Three documents are downloaded in sequence:
          1. the photo page, which yields the ``photo_secret`` and the metadata;
          2. a first data document, which yields the ``node_id``;
          3. a second data document, whose <STREAM> element holds the two
             parts of the media URL.

        Raises ExtractorError when *url* does not match _VALID_URL or when the
        <STREAM> element cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError below
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = ('https://secure.flickr.com/apps/video/video_mtl_xml.gne'
                     '?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self'
                     % (video_id, secret))
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = ('https://secure.flickr.com/video_playlist.gne'
                      '?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s'
                      '&rd=video.yahoo.com&noad=1'
                      % (node_id, secret))
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH part is HTML-unescaped before joining it to APP
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
274
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        The numeric video id is not part of the URL; it is read from the page
        and then used to fetch a separate data document holding the media URL.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the real id is known, the URL slug identifies the download.
        slug = match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        # Only the <file type="high"> entry is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [info]
313
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal "."
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the xHamster video at *url*.

        The page is re-fetched from a URL built from the numeric id. The upload
        date is optional: when it cannot be found a warning is reported and
        ``upload_date`` is None. The uploader id defaults to u'anonymous'.
        The description is not extracted.

        Raises ExtractorError when *url* does not match _VALID_URL or when the
        media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a readable error instead of an AttributeError below
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is itself a percent-encoded URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a single YYYYMMDD string
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
365
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the hypem track at *url*.

        The track page embeds a JSON track list; the first track's ``key`` and
        ``id`` are used to request a second JSON document that holds the
        final media URL. The cookie set by the first response is sent along
        with the second request.

        Raises ExtractorError when *url* does not match _VALID_URL, when either
        JSON document cannot be parsed, or when the track list is missing or
        empty.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        # Extend an existing query string instead of starting a second one
        separator = '&' if '?' in url else '?'
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            # Valid JSON without any track: report it as an extraction failure
            raise ExtractorError(u'Hypemachine contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty *bytes* body: Python 3's urllib rejects str as POST data
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
415
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the Vbox7 video at *url*.

        The first page only contains a script redirect; the target page
        supplies the title. The media and thumbnail URLs come from a separate
        POST request whose response is a ``name=value&name=value`` string.

        Raises ExtractorError when *url* does not match _VALID_URL or when the
        info response does not contain at least two ``name=value`` pairs.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the URL that was actually fetched
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data must be bytes on Python 3; urlencode() output is pure ASCII
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split on the first '=' only, so values containing '=' stay intact
        values = [pair.split('=', 1)[1]
                  for pair in info_response.split('&') if '=' in pair]
        if len(values) < 2:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = values[0], values[1]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
451
452
def gen_extractors():
    """Build one fresh instance of every supported extractor.

    The position in the returned list is significant: a URL is handled by
    the first extractor that matches it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
522
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    The 'IE' suffix is appended to *ie_name* before the lookup; a KeyError
    is raised when this module defines no such name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]