[LiveLeak] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.liveleak import LiveLeakIE
40 from .extractor.metacafe import MetacafeIE
41 from .extractor.mixcloud import MixcloudIE
42 from .extractor.mtv import MTVIE
43 from .extractor.myspass import MySpassIE
44 from .extractor.myvideo import MyVideoIE
45 from .extractor.nba import NBAIE
46 from .extractor.statigram import StatigramIE
47 from .extractor.photobucket import PhotobucketIE
48 from .extractor.pornotube import PornotubeIE
49 from .extractor.rbmaradio import RBMARadioIE
50 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
51 from .extractor.spiegel import SpiegelIE
52 from .extractor.stanfordoc import StanfordOpenClassroomIE
53 from .extractor.steam import SteamIE
54 from .extractor.ted import TEDIE
55 from .extractor.ustream import UstreamIE
56 from .extractor.vimeo import VimeoIE
57 from .extractor.worldstarhiphop import WorldStarHipHopIE
58 from .extractor.xnxx import XNXXIE
59 from .extractor.xvideos import XVideosIE
60 from .extractor.yahoo import YahooIE, YahooSearchIE
61 from .extractor.youjizz import YouJizzIE
62 from .extractor.youku import YoukuIE
63 from .extractor.youporn import YouPornIE
64 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
65 from .extractor.zdf import ZDFIE
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class TumblrIE(InfoExtractor):
    """Information Extractor for videos posted on Tumblr blogs"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video of a Tumblr post.

        Returns a one-element list with the info dictionary of the video.
        Raises ExtractorError if no video source is found in the page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Both /post/ and /video/ URLs are fetched through the /post/ page
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The pattern looks for a literal "\x22" where a double quote would
        # normally delimit the attribute values
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster. The text between "posters" and the
        # opening bracket is matched with a non-capturing group, so the
        # poster URL is the only capture group the search helper can return.
        video_thumbnail = self._search_regex(r'posters(?:.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            # Drop the backslashes left over from the escaped markup
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
132
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the mp3-320 download of a free Bandcamp track.

        Returns a one-element list with the info dictionary of the track.
        Raises ExtractorError when the track has no free download page or
        when one of the intermediate pages does not have the expected layout.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track info')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
178
class RedTubeIE(InfoExtractor):
    """Extractor for videos hosted on redtube.com"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Scrape the video page and return a one-item list of video info.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if not match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is read from the <source> tag of the page
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]
206         
class InaIE(InfoExtractor):
    """Extractor for videos published on ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Read the MRSS notice of the video and return a one-item info list."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The metadata is taken from the player's MRSS notice, not from the
        # page the user linked to
        notice_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        notice = self._download_webpage(notice_url, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]
233
class HowcastIE(InfoExtractor):
    """Extractor for videos hosted on howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Scrape the Howcast video page and return a one-item info list."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested through its canonical address
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The <meta> content may be wrapped in double or in single quotes
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
        }
        return [info]
267
class VineIE(InfoExtractor):
    """Extractor for videos hosted on vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Scrape the Vine video page and return a one-item info list."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested over https at its canonical address
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # The optional second group keeps a query string out of the first one
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'uploader': author,
        }
        return [info]
301
class FlickrIE(InfoExtractor):
    """Extractor for videos hosted on Flickr"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr video and return a one-item info list.

        Three pages are downloaded in sequence: the photo page, a first XML
        document giving a node id, and a second one giving the stream.
        Raises ExtractorError if the stream cannot be found in the latter.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = ''.join(
            ['http://www.flickr.com/photos/', uploader_id, '/', video_id])
        page = self._download_webpage(page_url, video_id)

        # The secret found in the page is needed by both XML requests below
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        mtl_url = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ])
        mtl_xml = self._download_webpage(mtl_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', mtl_xml, u'node_id')

        playlist_url = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ])
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part goes through unescapeHTML
        app, full_path = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
            'uploader_id': uploader_id,
        }
        return [info]
350
class TeamcocoIE(InfoExtractor):
    """Extractor for videos hosted on teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Scrape a Team Coco video page and return a one-item info list.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if not match:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = match.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric id is only available in the page markup, not in the URL
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location is listed in a separate XML document
        xml_doc = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            xml_doc, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': description,
        }
        return [info]
389
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a real "www." prefix matches
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the video of an xHamster movie page.

        Returns a one-element list with the info dictionary of the video.
        Raises ExtractorError if the media location is not found in the page.
        A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The page is requested by id only, with an empty title part
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Without a server, 'file' is taken as the (quoted) URL itself
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Year, month and day are concatenated into YYYYMMDD
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
441
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the audio of a Hype Machine track page.

        Returns a one-element list with the info dictionary of the track.
        Raises ExtractorError if the URL is invalid, if one of the JSON
        documents cannot be parsed, or if the track list is missing or empty.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the second request
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # Valid JSON may still lack the expected structure
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine did not return any track.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body is passed so that the request carries data; it is a
        # bytes literal because Python 3 rejects text strings as request data
        # (on Python 2 b"" and "" are the same object type).
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
491
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract the video of a Vbox7 play page.

        Returns a one-element list with the info dictionary of the video.
        Raises ExtractorError if the URL is invalid or if the info response
        does not consist of exactly two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only holds a javascript redirection; follow it by hand
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first slash is kept
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # Request data must be bytes on Python 3; urlencode output is ASCII
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split on the first '=' only, so that values which themselves
            # contain '=' are kept whole
            (final_url, thumbnail_url) = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (ValueError, IndexError):
            # Wrong number of fields, or a field without '='
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
527
528
def gen_extractors():
    """Build one instance of each supported extractor.

    The position in the returned list is significant: the first extractor
    that matches a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the matching priority is unchanged
    return [extractor_class() for extractor_class in extractor_classes]
598
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is searched in this module's global namespace; a KeyError is
    raised when no such name exists there.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]