2b8e6e6c64810cb4ed5d303f1abebd57516f6718
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.liveleak import LiveLeakIE
40 from .extractor.metacafe import MetacafeIE
41 from .extractor.mixcloud import MixcloudIE
42 from .extractor.mtv import MTVIE
43 from .extractor.myspass import MySpassIE
44 from .extractor.myvideo import MyVideoIE
45 from .extractor.nba import NBAIE
46 from .extractor.statigram import StatigramIE
47 from .extractor.photobucket import PhotobucketIE
48 from .extractor.pornotube import PornotubeIE
49 from .extractor.rbmaradio import RBMARadioIE
50 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
51 from .extractor.spiegel import SpiegelIE
52 from .extractor.stanfordoc import StanfordOpenClassroomIE
53 from .extractor.steam import SteamIE
54 from .extractor.ted import TEDIE
55 from .extractor.tumblr import TumblrIE
56 from .extractor.ustream import UstreamIE
57 from .extractor.vimeo import VimeoIE
58 from .extractor.worldstarhiphop import WorldStarHipHopIE
59 from .extractor.xnxx import XNXXIE
60 from .extractor.xvideos import XVideosIE
61 from .extractor.yahoo import YahooIE, YahooSearchIE
62 from .extractor.youjizz import YouJizzIE
63 from .extractor.youku import YoukuIE
64 from .extractor.youporn import YouPornIE
65 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
66 from .extractor.zdf import ZDFIE
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list describing the mp3-320 version of a track.

        Raises ExtractorError if the URL does not match _VALID_URL, if the
        track page has no free download link, or if one of the markers this
        method searches for is missing from a downloaded page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url format')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract the final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
146
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube video pages"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with id, url, ext and title of the video.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both values are scraped from the HTML of the video page.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        page_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': page_title,
        }
        return [result]
174         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with id, url, ext and title of the video.

        The data is read from the MRSS document of the notice rather than
        from the video page itself.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # The dots of the host name are escaped so that only the literal
        # host mp4.ina.fr is accepted.
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
201
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        Description and thumbnail are looked up with fatal=False; the video
        URL and the title use the default search behaviour.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the id, not the URL as given.
        page = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
        }
        return [entry]
235
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        Thumbnail and uploader are looked up with fatal=False; the video URL
        and the title use the default search behaviour.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        # re.DOTALL lets the pattern span the lines between the div and the h2.
        uploader_name = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        entry = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader': uploader_name,
        }
        return [entry]
269
class FlickrIE(InfoExtractor):
    """Information Extractor for videos hosted on Flickr"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        Three pages are downloaded: the photo page, a first XML document
        that yields a node id, and a second XML document that yields the
        stream location.
        Raises ExtractorError when the second document has no STREAM element
        of the expected form.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        # The photo secret from the page is required by both XML requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        mtl_url = ('https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id='
                   + video_id + '&secret=' + secret + '&bitrate=700&target=_self')
        mtl_xml = self._download_webpage(mtl_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', mtl_xml, u'node_id')

        playlist_url = ('https://secure.flickr.com/video_playlist.gne?node_id='
                        + node_id + '&tech=flash&mode=playlist&bitrate=700&secret='
                        + secret + '&rd=video.yahoo.com&noad=1')
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH part is HTML-unescaped; APP is used as found.
        app, full_path = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
            'uploader_id': uploader_id,
        }
        return [entry]
318
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        The numeric video id is read from the page and then used to fetch
        an XML document that holds the media URL.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the real id is known the title part of the URL identifies
        # the download.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        xml_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        xml_data = self._download_webpage(xml_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', xml_data, u'video URL')

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [entry]
357
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that the optional prefix only
    # matches a literal "www.".
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        Raises ExtractorError when the URL does not match _VALID_URL or when
        the server/file pair cannot be found in the page. A missing upload
        date only produces a warning, and the uploader id falls back to
        u'anonymous'.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Without a server the file entry itself is a quoted full URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it is not visible anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
409
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list describing the track behind ``url``.

        The track page is downloaded first; the cookie it sets is sent along
        with a second request that returns the media URL as JSON.
        Raises ExtractorError when the URL does not match _VALID_URL or when
        either JSON document is malformed or lacks an expected field.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (ValueError, KeyError, IndexError, TypeError):
            # Syntactically valid JSON with an unexpected structure is
            # reported the same way as unparsable JSON.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The body is an empty bytes literal: urllib on Python 3 refuses a
        # text string as request data, and on Python 2 b"" is the same as "".
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
            final_url = song_data[u"url"]
        except (ValueError, KeyError, TypeError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
459
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list describing the video behind ``url``.

        The title comes from the page reached through a javascript redirect;
        the media and thumbnail URLs come from a form-encoded POST reply.
        Raises ExtractorError when the URL does not match _VALID_URL or when
        the reply does not contain two ``name=value`` fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode output is percent-encoded ASCII text; urllib on Python 3
        # requires the request body to be bytes.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split every field on the first '=' only, so that values which
        # themselves contain '=' (e.g. URLs with a query) stay intact.
        fields = [field.split('=', 1) for field in info_response.split('&')]
        values = [pair[1] for pair in fields if len(pair) == 2]
        if len(values) < 2:
            raise ExtractorError(u'Unable to extract the media url')
        # The first value is used as media URL, the second as thumbnail URL.
        final_url, thumbnail_url = values[:2]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
495
496
def gen_extractors():
    """Build one fresh instance of each supported extractor, in priority order.

    The order is significant: the first extractor that matches a URL is the
    one that handles it.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate with no arguments, keeping the order of the tuple above.
    return [extractor_class() for extractor_class in ordered_classes]
566
def get_info_extractor(ie_name):
    """Look up the module-level name ``ie_name + 'IE'`` and return its value.

    A KeyError is raised if this module has no global of that name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]