[youjizz] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.pornotube import PornotubeIE
45 from .extractor.rbmaradio import RBMARadioIE
46 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
47 from .extractor.stanfordoc import StanfordOpenClassroomIE
48 from .extractor.steam import SteamIE
49 from .extractor.ted import TEDIE
50 from .extractor.ustream import UstreamIE
51 from .extractor.vimeo import VimeoIE
52 from .extractor.worldstarhiphop import WorldStarHipHopIE
53 from .extractor.xnxx import XNXXIE
54 from .extractor.xvideos import XVideosIE
55 from .extractor.yahoo import YahooIE, YahooSearchIE
56 from .extractor.youjizz import YouJizzIE
57 from .extractor.youku import YoukuIE
58 from .extractor.youporn import YouPornIE
59 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
60 from .extractor.zdf import ZDFIE
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    A mix is returned as a list with one info dict per track. Tracks are
    discovered one at a time by requesting the site's ``play`` endpoint and
    then following its ``next`` endpoint until the last track is reported.
    """
    IE_NAME = '8tracks'
    # The dot of the host name is escaped so it only matches a literal '.'.
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list of info dicts, one for every track of the mix.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded in the page as a JSON literal
        # assigned to ``PAGE.mix``.
        json_like = self._search_regex(r"PAGE\.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session number is sent with every play/next request.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # The next request must name the track that was just returned.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
126
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    IE_NAME = u'keek'
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the keek at ``url``."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        # The title is mandatory; the uploader is optional (fatal=False).
        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        uploader_name = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        # Media and thumbnail URLs are built directly from the video id.
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader_name,
        }]
154
155
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos.

    All video information is read from an XML metadata document that is
    requested with the video id taken from the page URL.
    """
    # Dots of the host name are escaped so they only match a literal '.'.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if the metadata lacks the download URL or the
        title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            # Text of the child element ``tag``, or None if it is absent.
            element = metadata.find(tag)
            return None if element is None else element.text

        # mandatory values
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        # file extension of the media URL without the leading dot
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional values; the format falls back to the file extension
        # (this fallback used to reference an undefined name and crashed)
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }
        return [info]
209
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        # File name and duration come from a per-video XML document.
        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml',
            video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Only the last child of the document root is used.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            # the extension is whatever follows the last dot of the file name
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
241
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    # The scheme is optional; both http and https are accepted. (The pattern
    # used to read ``http?``, which made the "p" optional instead of the "s".)
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped from the title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional (fatal=False).
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }

        return [info]
278
279
280
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the post's video.

        Raises ExtractorError if no video can be found in the post page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical post page, whatever URL form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with its quotes written as \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster. The pattern has exactly one capturing
        # group so that the poster URL is unambiguously the extracted value
        # (an extra leading group used to precede it).
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
314
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the track at ``url``.

        Raises ExtractorError if the track offers no free download or if one
        of the intermediate pages does not have the expected content.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
360
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both the media URL and the title are mandatory.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
388         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Media URL and title are both read from the MRSS notice of the video.
        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
415
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested through its canonical address.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        # Mandatory fields.
        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Optional fields (fatal=False).
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
        }
        return [result]
449
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested through its canonical https address.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        # Mandatory fields.
        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Optional fields (fatal=False).
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader_name = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader': uploader_name,
        }
        return [result]
483
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if the stream description cannot be found in
        the playlist document.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)

        # The secret found in the photo page is needed by both data requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # First request: yields the node id of the video.
        node_lookup_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        node_lookup_xml = self._download_webpage(
            node_lookup_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', node_lookup_xml, u'node_id')

        # Second request: the playlist that describes the actual stream.
        playlist_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        playlist_xml = self._download_webpage(
            playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The media URL is the APP value followed by the unescaped FULLPATH.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        # The title is mandatory, description and thumbnail are optional.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
            'uploader_id': uploader_id,
        }
        return [result]
532
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = url_match.group('url_title')

        # The numeric video id only appears inside the page itself.
        page = self._download_webpage(url, url_title)
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')

        # Optional fields (fatal=False).
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media URL is read from a separate XML document for this id.
        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data_xml, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [result]
571
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so it only matches a literal '.'.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if the media location cannot be found in the
        page. A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Without a server, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        # The upload date is reported as YYYYMMDD.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
623
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the track at ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or if
        one of the downloaded JSON documents cannot be decoded.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        # Request the track page with an extra query string and keep the
        # cookie of the response for the later "serve" request.
        query = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        page_request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The track list is embedded in the page as a JSON script element.
        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The final media URL is served by a second JSON endpoint.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        serve_request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        serve_request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(serve_request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        result = {
            'id': track_id,
            'url': song_data[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }
        return [result]
673
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video at ``url``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or if
        the info request yields no response.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # The first page only redirects through javascript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(
            urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        # Only the part of the page title before the first '/' is kept.
        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # Media and thumbnail URLs are obtained with a form-encoded POST.
        post_data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do", post_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        def value_of(pair):
            # Second '='-separated field of a "name=value" pair.
            return pair.split('=')[1]

        # The response is expected to hold exactly two '&'-separated pairs.
        final_url, thumbnail_url = map(value_of, info_response.split('&'))

        result = {
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }
        return [result]
709
710
def gen_extractors():
    """Return a list holding one new instance of every supported extractor.

    The order does matter; the first extractor matched is the one handling
    the URL.
    """
    # Classes are listed in matching priority and instantiated in that order.
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
780
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is fetched from this module's global namespace, so a KeyError
    is raised when no global called ``ie_name + 'IE'`` exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]