Move StanfordOC IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.statigram import StatigramIE
38 from .extractor.photobucket import PhotobucketIE
39 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
40 from .extractor.stanfordoc import StanfordOpenClassroomIE
41 from .extractor.vimeo import VimeoIE
42 from .extractor.xvideos import XVideosIE
43 from .extractor.yahoo import YahooIE, YahooSearchIE
44 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
45 from .extractor.zdf import ZDFIE
46
47
48
49 class MixcloudIE(InfoExtractor):
50     """Information extractor for www.mixcloud.com"""
51
52     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
53     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
54     IE_NAME = u'mixcloud'
55
56     def report_download_json(self, file_id):
57         """Report JSON download."""
58         self.to_screen(u'Downloading json')
59
60     def get_urls(self, jsonData, fmt, bitrate='best'):
61         """Get urls from 'audio_formats' section in json"""
62         file_url = None
63         try:
64             bitrate_list = jsonData[fmt]
65             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
66                 bitrate = max(bitrate_list) # select highest
67
68             url_list = jsonData[fmt][bitrate]
69         except TypeError: # we have no bitrate info.
70             url_list = jsonData[fmt]
71         return url_list
72
73     def check_urls(self, url_list):
74         """Returns 1st active url from list"""
75         for url in url_list:
76             try:
77                 compat_urllib_request.urlopen(url)
78                 return url
79             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
80                 url = None
81
82         return None
83
84     def _print_formats(self, formats):
85         print('Available formats:')
86         for fmt in formats.keys():
87             for b in formats[fmt]:
88                 try:
89                     ext = formats[fmt][b][0]
90                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
91                 except TypeError: # we have no bitrate info
92                     ext = formats[fmt][0]
93                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
94                     break
95
96     def _real_extract(self, url):
97         mobj = re.match(self._VALID_URL, url)
98         if mobj is None:
99             raise ExtractorError(u'Invalid URL: %s' % url)
100         # extract uploader & filename from url
101         uploader = mobj.group(1)
102         file_id = uploader + "-" + mobj.group(2)
103
104         # construct API request
105         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
106         # retrieve .json file with links to files
107         request = compat_urllib_request.Request(file_url)
108         try:
109             self.report_download_json(file_url)
110             jsonData = compat_urllib_request.urlopen(request).read()
111         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
112             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
113
114         # parse JSON
115         json_data = json.loads(jsonData.decode('utf-8'))
116         player_url = json_data['player_swf_url']
117         formats = dict(json_data['audio_formats'])
118
119         req_format = self._downloader.params.get('format', None)
120         bitrate = None
121
122         if self._downloader.params.get('listformats', None):
123             self._print_formats(formats)
124             return
125
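        # With no explicit format request, walk every advertised format and
        # take the first URL that actually responds; otherwise restrict the
        # check to the requested format only.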
126         if req_format is None or req_format == 'best':
127             for format_param in formats.keys():
128                 url_list = self.get_urls(formats, format_param)
129                 # check urls
130                 file_url = self.check_urls(url_list)
131                 if file_url is not None:
132                     break # got it!
133         else:
134             if req_format not in formats:
135                 raise ExtractorError(u'Format is not available')
136
137             url_list = self.get_urls(formats, req_format)
138             file_url = self.check_urls(url_list)
139             format_param = req_format
140
141         return [{
142             'id': file_id,
143             'url': file_url,
144             'uploader': uploader,
145             'upload_date': None,
146             'title': json_data['name'],
147             'ext': file_url.split('.')[-1],
148             'format': format_param if format_param is not None else u'NA',
149             'thumbnail': json_data['thumbnail_url'],
150             'description': json_data['description'],
151             'player_url': player_url,
152         }]
153
154
155 class MTVIE(InfoExtractor):
156     """Information extractor for MTV.com"""
157
158     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
159     IE_NAME = u'mtv'
160
161     def _real_extract(self, url):
162         mobj = re.match(self._VALID_URL, url)
163         if mobj is None:
164             raise ExtractorError(u'Invalid URL: %s' % url)
165         if not mobj.group('proto'):
166             url = 'http://' + url
167         video_id = mobj.group('videoid')
168
169         webpage = self._download_webpage(url, video_id)
170
171         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
172             webpage, u'song name', fatal=False)
173
174         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
175             webpage, u'performer')
176
177         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
178             webpage, u'mtvn_uri', fatal=False)
179
180         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
181             webpage, u'content id', fatal=False)
182
183         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
184         self.report_extraction(video_id)
185         request = compat_urllib_request.Request(videogen_url)
186         try:
187             metadataXml = compat_urllib_request.urlopen(request).read()
188         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
189             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
190
191         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
192         renditions = mdoc.findall('.//rendition')
193
194         # For now, always pick the highest quality.
195         rendition = renditions[-1]
196
197         try:
198             _,_,ext = rendition.attrib['type'].partition('/')
199             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
200             video_url = rendition.find('./src').text
201         except KeyError:
202             raise ExtractorError('Invalid rendition field.')
203
204         info = {
205             'id': video_id,
206             'url': video_url,
207             'uploader': performer,
208             'upload_date': None,
209             'title': (u'%s - %s' % (performer, song_name)) if song_name else performer,
210             'ext': ext,
211             'format': format,
212         }
213
214         return [info]
215
216
217 class YoukuIE(InfoExtractor):
218     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
219
220     def _gen_sid(self):
221         nowTime = int(time.time() * 1000)
222         random1 = random.randint(1000,1998)
223         random2 = random.randint(1000,9999)
224
225         return "%d%d%d" %(nowTime,random1,random2)
226
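    # The two helpers below undo Youku's client-side file-id scrambling: a
    # linear-congruential generator seeded with the playlist's 'seed' value
    # shuffles a fixed alphabet, and _get_file_id() maps the '*'-separated
    # indices of the scrambled fileId through that shuffled alphabet.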
227     def _get_file_ID_mix_string(self, seed):
228         mixed = []
229         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
230         seed = float(seed)
231         for i in range(len(source)):
232             seed  =  (seed * 211 + 30031 ) % 65536
233             index  =  math.floor(seed / 65536 * len(source) )
234             mixed.append(source[int(index)])
235             source.remove(source[int(index)])
236         #return ''.join(mixed)
237         return mixed
238
239     def _get_file_id(self, fileId, seed):
240         mixed = self._get_file_ID_mix_string(seed)
241         ids = fileId.split('*')
242         realId = []
243         for ch in ids:
244             if ch:
245                 realId.append(mixed[int(ch)])
246         return ''.join(realId)
247
248     def _real_extract(self, url):
249         mobj = re.match(self._VALID_URL, url)
250         if mobj is None:
251             raise ExtractorError(u'Invalid URL: %s' % url)
252         video_id = mobj.group('ID')
253
254         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
255
256         jsondata = self._download_webpage(info_url, video_id)
257
258         self.report_extraction(video_id)
259         try:
260             config = json.loads(jsondata)
261
262             video_title =  config['data'][0]['title']
263             seed = config['data'][0]['seed']
264
265             format = self._downloader.params.get('format', None)
266             supported_format = list(config['data'][0]['streamfileids'].keys())
267
268             if format is None or format == 'best':
269                 if 'hd2' in supported_format:
270                     format = 'hd2'
271                 else:
272                     format = 'flv'
273                 ext = u'flv'
274             elif format == 'worst':
275                 format = 'mp4'
276                 ext = u'mp4'
277             else:
278                 format = 'flv'
279                 ext = u'flv'
280
281
282             fileid = config['data'][0]['streamfileids'][format]
283             keys = [s['k'] for s in config['data'][0]['segs'][format]]
284         except (UnicodeDecodeError, ValueError, KeyError):
285             raise ExtractorError(u'Unable to extract info section')
286
287         files_info=[]
288         sid = self._gen_sid()
289         fileid = self._get_file_id(fileid, seed)
290
291         # characters 8 and 9 (0-based) of fileid encode the segment number,
292         # so fileid[8:10] is replaced with the segment index for each part
293         for index, key in enumerate(keys):
294
295             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
296             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
297
298             info = {
299                 'id': '%s_part%02d' % (video_id, index),
300                 'url': download_url,
301                 'uploader': None,
302                 'upload_date': None,
303                 'title': video_title,
304                 'ext': ext,
305             }
306             files_info.append(info)
307
308         return files_info
309
310
311 class XNXXIE(InfoExtractor):
312     """Information extractor for xnxx.com"""
313
314     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
315     IE_NAME = u'xnxx'
316     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
317     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
318     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
319
320     def _real_extract(self, url):
321         mobj = re.match(self._VALID_URL, url)
322         if mobj is None:
323             raise ExtractorError(u'Invalid URL: %s' % url)
324         video_id = mobj.group(1)
325
326         # Get webpage content
327         webpage = self._download_webpage(url, video_id)
328
329         video_url = self._search_regex(self.VIDEO_URL_RE,
330             webpage, u'video URL')
331         video_url = compat_urllib_parse.unquote(video_url)
332
333         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
334             webpage, u'title')
335
336         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
337             webpage, u'thumbnail', fatal=False)
338
339         return [{
340             'id': video_id,
341             'url': video_url,
342             'uploader': None,
343             'upload_date': None,
344             'title': video_title,
345             'ext': 'flv',
346             'thumbnail': video_thumbnail,
347             'description': None,
348         }]
349
350
351
352 class NBAIE(InfoExtractor):
353     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
354     IE_NAME = u'nba'
355
356     def _real_extract(self, url):
357         mobj = re.match(self._VALID_URL, url)
358         if mobj is None:
359             raise ExtractorError(u'Invalid URL: %s' % url)
360
361         video_id = mobj.group(1)
362
363         webpage = self._download_webpage(url, video_id)
364
365         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
366
367         shortened_video_id = video_id.rpartition('/')[2]
368         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
369             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
370
371         # The upload date isn't present in the HTML the page returns to us
372         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
373
374         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
375
376         info = {
377             'id': shortened_video_id,
378             'url': video_url,
379             'ext': 'mp4',
380             'title': title,
381             # 'uploader_date': uploader_date,
382             'description': description,
383         }
384         return [info]
385
386 class JustinTVIE(InfoExtractor):
387     """Information extractor for justin.tv and twitch.tv"""
388     # TODO: One broadcast may be split into multiple videos. The key
389     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
390     # starts at 1 and increases. Can we treat all parts as one video?
391
392     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
393         (?:
394             (?P<channelid>[^/]+)|
395             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
396             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
397         )
398         /?(?:\#.*)?$
399         """
400     _JUSTIN_PAGE_LIMIT = 100
401     IE_NAME = u'justin.tv'
402
403     def report_download_page(self, channel, offset):
404         """Report attempt to download a single page of videos."""
405         self.to_screen(u'%s: Downloading video information from %d to %d' %
406                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
407
408     # Return count of items, list of *valid* items
409     def _parse_page(self, url, video_id):
410         webpage = self._download_webpage(url, video_id,
411                                          u'Downloading video info JSON',
412                                          u'unable to download video info JSON')
413
414         response = json.loads(webpage)
415         if not isinstance(response, list):
416             error_text = response.get('error', 'unknown error')
417             raise ExtractorError(u'Justin.tv API: %s' % error_text)
418         info = []
419         for clip in response:
420             video_url = clip['video_file_url']
421             if video_url:
422                 video_extension = os.path.splitext(video_url)[1][1:]
423                 video_date = re.sub('-', '', clip['start_time'][:10])
424                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
425                 video_id = clip['id']
426                 video_title = clip.get('title', video_id)
427                 info.append({
428                     'id': video_id,
429                     'url': video_url,
430                     'title': video_title,
431                     'uploader': clip.get('channel_name', video_uploader_id),
432                     'uploader_id': video_uploader_id,
433                     'upload_date': video_date,
434                     'ext': video_extension,
435                 })
436         return (len(response), info)
437
438     def _real_extract(self, url):
439         mobj = re.match(self._VALID_URL, url)
440         if mobj is None:
441             raise ExtractorError(u'invalid URL: %s' % url)
442
443         api_base = 'http://api.justin.tv'
444         paged = False
445         if mobj.group('channelid'):
446             paged = True
447             video_id = mobj.group('channelid')
448             api = api_base + '/channel/archives/%s.json' % video_id
449         elif mobj.group('chapterid'):
450             chapter_id = mobj.group('chapterid')
451
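            # A /c/ chapter URL points into an archived broadcast: read the
            # archive id from the chapter page, locate the matching <archive>
            # node in the by_chapter XML for its video_file_url, then fetch
            # title/description metadata from the Kraken chapter API.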
452             webpage = self._download_webpage(url, chapter_id)
453             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
454             if not m:
455                 raise ExtractorError(u'Cannot find archive of a chapter')
456             archive_id = m.group(1)
457
458             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
459             chapter_info_xml = self._download_webpage(api, chapter_id,
460                                              note=u'Downloading chapter information',
461                                              errnote=u'Chapter information download failed')
462             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
463             for a in doc.findall('.//archive'):
464                 if archive_id == a.find('./id').text:
465                     break
466             else:
467                 raise ExtractorError(u'Could not find chapter in chapter information')
468
469             video_url = a.find('./video_file_url').text
470             video_ext = video_url.rpartition('.')[2] or u'flv'
471
472             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
473             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
474                                    note='Downloading chapter metadata',
475                                    errnote='Download of chapter metadata failed')
476             chapter_info = json.loads(chapter_info_json)
477
478             bracket_start = int(doc.find('.//bracket_start').text)
479             bracket_end = int(doc.find('.//bracket_end').text)
480
481             # TODO determine start (and probably fix up file)
482             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
483             #video_url += u'?start=' + TODO:start_timestamp
484             # bracket_start is 13290, but we want 51670615
485             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
486                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
487
488             info = {
489                 'id': u'c' + chapter_id,
490                 'url': video_url,
491                 'ext': video_ext,
492                 'title': chapter_info['title'],
493                 'thumbnail': chapter_info['preview'],
494                 'description': chapter_info['description'],
495                 'uploader': chapter_info['channel']['display_name'],
496                 'uploader_id': chapter_info['channel']['name'],
497             }
498             return [info]
499         else:
500             video_id = mobj.group('videoid')
501             api = api_base + '/broadcast/by_archive/%s.json' % video_id
502
503         self.report_extraction(video_id)
504
505         info = []
506         offset = 0
507         limit = self._JUSTIN_PAGE_LIMIT
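        # Channel archives are paged: keep requesting pages of `limit` items
        # and stop as soon as a page comes back short (or the URL isn't paged).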
508         while True:
509             if paged:
510                 self.report_download_page(video_id, offset)
511             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
512             page_count, page_info = self._parse_page(page_url, video_id)
513             info.extend(page_info)
514             if not paged or page_count != limit:
515                 break
516             offset += limit
517         return info
518
519 class FunnyOrDieIE(InfoExtractor):
520     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
521
522     def _real_extract(self, url):
523         mobj = re.match(self._VALID_URL, url)
524         if mobj is None:
525             raise ExtractorError(u'invalid URL: %s' % url)
526
527         video_id = mobj.group('id')
528         webpage = self._download_webpage(url, video_id)
529
530         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
531             webpage, u'video URL', flags=re.DOTALL)
532
533         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
534             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
535
536         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
537             webpage, u'description', fatal=False, flags=re.DOTALL)
538
539         info = {
540             'id': video_id,
541             'url': video_url,
542             'ext': 'mp4',
543             'title': title,
544             'description': video_description,
545         }
546         return [info]
547
548 class SteamIE(InfoExtractor):
549     _VALID_URL = r"""http://store\.steampowered\.com/
550                 (agecheck/)?
551                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
552                 (?P<gameID>\d+)/?
553                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
554                 """
555     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
556     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
557
558     @classmethod
559     def suitable(cls, url):
560         """Receives a URL and returns True if suitable for this IE."""
561         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
562
563     def _real_extract(self, url):
564         m = re.match(self._VALID_URL, url, re.VERBOSE)
565         gameID = m.group('gameID')
566
567         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
568         webpage = self._download_webpage(videourl, gameID)
569
570         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
571             videourl = self._AGECHECK_TEMPLATE % gameID
572             self.report_age_confirmation()
573             webpage = self._download_webpage(videourl, gameID)
574
575         self.report_extraction(gameID)
576         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
577                                              webpage, 'game title')
578
579         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
580         mweb = re.finditer(urlRE, webpage)
581         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
582         titles = re.finditer(namesRE, webpage)
583         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
584         thumbs = re.finditer(thumbsRE, webpage)
585         videos = []
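        # The three finditer() scans above walk the page in document order, so
        # zipping them pairs each movie URL with its title and thumbnail.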
586         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
587             video_id = vid.group('videoID')
588             title = vtitle.group('videoName')
589             video_url = vid.group('videoURL')
590             video_thumb = thumb.group('thumbnail')
591             if not video_url:
592                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
593             info = {
594                 'id':video_id,
595                 'url':video_url,
596                 'ext': 'flv',
597                 'title': unescapeHTML(title),
598                 'thumbnail': video_thumb
599                   }
600             videos.append(info)
601         return [self.playlist_result(videos, gameID, game_title)]
602
603 class UstreamIE(InfoExtractor):
604     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
605     IE_NAME = u'ustream'
606
607     def _real_extract(self, url):
608         m = re.match(self._VALID_URL, url)
609         video_id = m.group('videoID')
610
611         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
612         webpage = self._download_webpage(url, video_id)
613
614         self.report_extraction(video_id)
615
616         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
617             webpage, u'title')
618
619         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
620             webpage, u'uploader', fatal=False, flags=re.DOTALL)
621
622         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
623             webpage, u'thumbnail', fatal=False)
624
625         info = {
626                 'id': video_id,
627                 'url': video_url,
628                 'ext': 'flv',
629                 'title': video_title,
630                 'uploader': uploader,
631                 'thumbnail': thumbnail,
632                }
633         return info
634
635 class WorldStarHipHopIE(InfoExtractor):
636     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
637     IE_NAME = u'WorldStarHipHop'
638
639     def _real_extract(self, url):
640         m = re.match(self._VALID_URL, url)
641         video_id = m.group('id')
642
643         webpage_src = self._download_webpage(url, video_id)
644
645         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
646             webpage_src, u'video URL')
647
648         if 'mp4' in video_url:
649             ext = 'mp4'
650         else:
651             ext = 'flv'
652
653         video_title = self._html_search_regex(r"<title>(.*)</title>",
654             webpage_src, u'title')
655
656         # Get the thumbnail; when it is missing (WSHH candy videos), pull the correct title from the candytitles span instead.
657         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
658             webpage_src, u'thumbnail', fatal=False)
659
660         if not thumbnail:
661             _title = r"""candytitles.*>(.*)</span>"""
662             mobj = re.search(_title, webpage_src)
663             if mobj is not None:
664                 video_title = mobj.group(1)
665
666         results = [{
667                     'id': video_id,
668                     'url' : video_url,
669                     'title' : video_title,
670                     'thumbnail' : thumbnail,
671                     'ext' : ext,
672                     }]
673         return results
674
675 class RBMARadioIE(InfoExtractor):
676     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
677
678     def _real_extract(self, url):
679         m = re.match(self._VALID_URL, url)
680         video_id = m.group('videoID')
681
682         webpage = self._download_webpage(url, video_id)
683
684         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
685             webpage, u'json data', flags=re.MULTILINE)
686
687         try:
688             data = json.loads(json_data)
689         except ValueError as e:
690             raise ExtractorError(u'Invalid JSON: ' + str(e))
691
692         video_url = data['akamai_url'] + '&cbr=256'
693         url_parts = compat_urllib_parse_urlparse(video_url)
694         video_ext = url_parts.path.rpartition('.')[2]
695         info = {
696                 'id': video_id,
697                 'url': video_url,
698                 'ext': video_ext,
699                 'title': data['title'],
700                 'description': data.get('teaser_text'),
701                 'location': data.get('country_of_origin'),
702                 'uploader': data.get('host', {}).get('name'),
703                 'uploader_id': data.get('host', {}).get('slug'),
704                 'thumbnail': data.get('image', {}).get('large_url_2x'),
705                 'duration': data.get('duration'),
706         }
707         return [info]
708
709
710 class YouPornIE(InfoExtractor):
711     """Information extractor for youporn.com."""
712     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
713
714     def _print_formats(self, formats):
715         """Print all available formats"""
716         print(u'Available formats:')
717         print(u'ext\t\tformat')
718         print(u'---------------------------------')
719         for format in formats:
720             print(u'%s\t\t%s'  % (format['ext'], format['format']))
721
722     def _specific(self, req_format, formats):
723         for x in formats:
724             if(x["format"]==req_format):
725                 return x
726         return None
727
728     def _real_extract(self, url):
729         mobj = re.match(self._VALID_URL, url)
730         if mobj is None:
731             raise ExtractorError(u'Invalid URL: %s' % url)
732         video_id = mobj.group('videoid')
733
734         req = compat_urllib_request.Request(url)
735         req.add_header('Cookie', 'age_verified=1')
736         webpage = self._download_webpage(req, video_id)
737
738         # Get JSON parameters
739         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
740         try:
741             params = json.loads(json_params)
742         except ValueError:
743             raise ExtractorError(u'Invalid JSON')
744
745         self.report_extraction(video_id)
746         try:
747             video_title = params['title']
748             upload_date = unified_strdate(params['release_date_f'])
749             video_description = params['description']
750             video_uploader = params['submitted_by']
751             thumbnail = params['thumbnails'][0]['image']
752         except KeyError as err:
753             raise ExtractorError(u'Missing JSON parameter: %s' % err)
754
755         # Get all of the formats available
756         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
757         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
758             webpage, u'download list').strip()
759
760         # Get all of the links from the page
761         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
762         links = re.findall(LINK_RE, download_list_html)
763         if(len(links) == 0):
764             raise ExtractorError(u'ERROR: no known formats available for video')
765
766         self.to_screen(u'Links found: %d' % len(links))
767
768         formats = []
769         for link in links:
770
771             # A link looks like this:
772             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
773             # A path looks like this:
774             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
775             video_url = unescapeHTML( link )
776             path = compat_urllib_parse_urlparse( video_url ).path
777             extension = os.path.splitext( path )[1][1:]
778             format = path.split('/')[4].split('_')[:2]
779             size = format[0]
780             bitrate = format[1]
781             format = "-".join( format )
782             # title = u'%s-%s-%s' % (video_title, size, bitrate)
783
784             formats.append({
785                 'id': video_id,
786                 'url': video_url,
787                 'uploader': video_uploader,
788                 'upload_date': upload_date,
789                 'title': video_title,
790                 'ext': extension,
791                 'format': format,
792                 'thumbnail': thumbnail,
793                 'description': video_description
794             })
795
796         if self._downloader.params.get('listformats', None):
797             self._print_formats(formats)
798             return
799
800         req_format = self._downloader.params.get('format', None)
801         self.to_screen(u'Format: %s' % req_format)
802
803         if req_format is None or req_format == 'best':
804             return [formats[0]]
805         elif req_format == 'worst':
806             return [formats[-1]]
807         elif req_format in ('-1', 'all'):
808             return formats
809         else:
810             format = self._specific(req_format, formats)
811             if format is None:
812                 raise ExtractorError(u'Requested format not available')
813             return [format]
814
815
816
817 class PornotubeIE(InfoExtractor):
818     """Information extractor for pornotube.com."""
819     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
820
821     def _real_extract(self, url):
822         mobj = re.match(self._VALID_URL, url)
823         if mobj is None:
824             raise ExtractorError(u'Invalid URL: %s' % url)
825
826         video_id = mobj.group('videoid')
827         video_title = mobj.group('title')
828
829         # Get webpage content
830         webpage = self._download_webpage(url, video_id)
831
832         # Get the video URL
833         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
834         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
835         video_url = compat_urllib_parse.unquote(video_url)
836
837         #Get the uploaded date
838         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
839         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
840         if upload_date: upload_date = unified_strdate(upload_date)
841
842         info = {'id': video_id,
843                 'url': video_url,
844                 'uploader': None,
845                 'upload_date': upload_date,
846                 'title': video_title,
847                 'ext': 'flv',
848                 'format': 'flv'}
849
850         return [info]
851
852 class YouJizzIE(InfoExtractor):
853     """Information extractor for youjizz.com."""
854     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
855
856     def _real_extract(self, url):
857         mobj = re.match(self._VALID_URL, url)
858         if mobj is None:
859             raise ExtractorError(u'Invalid URL: %s' % url)
860
861         video_id = mobj.group('videoid')
862
863         # Get webpage content
864         webpage = self._download_webpage(url, video_id)
865
866         # Get the video title
867         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
868             webpage, u'title').strip()
869
870         # Get the embed page
871         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
872         if result is None:
873             raise ExtractorError(u'ERROR: unable to extract embed page')
874
875         embed_page_url = result.group(0).strip()
876         video_id = result.group('videoid')
877
878         webpage = self._download_webpage(embed_page_url, video_id)
879
880         # Get the video URL
881         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
882             webpage, u'video URL')
883
884         info = {'id': video_id,
885                 'url': video_url,
886                 'title': video_title,
887                 'ext': 'flv',
888                 'format': 'flv',
889                 'player_url': embed_page_url}
890
891         return [info]
892
893 class EightTracksIE(InfoExtractor):
894     IE_NAME = '8tracks'
895     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
896
897     def _real_extract(self, url):
898         mobj = re.match(self._VALID_URL, url)
899         if mobj is None:
900             raise ExtractorError(u'Invalid URL: %s' % url)
901         playlist_id = mobj.group('id')
902
903         webpage = self._download_webpage(url, playlist_id)
904
905         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
906         data = json.loads(json_like)
907
908         session = str(random.randint(0, 1000000000))
909         mix_id = data['id']
910         track_count = data['tracks_count']
911         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
912         next_url = first_url
913         res = []
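        # Tracks are fetched one at a time; the id of the track just received
        # is passed back to request the next one, until 'at_last_track' is set.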
914         for i in itertools.count():
915             api_json = self._download_webpage(next_url, playlist_id,
916                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
917                 errnote=u'Failed to download song information')
918             api_data = json.loads(api_json)
919             track_data = api_data[u'set']['track']
920             info = {
921                 'id': track_data['id'],
922                 'url': track_data['track_file_stream_url'],
923                 'title': track_data['performer'] + u' - ' + track_data['name'],
924                 'raw_title': track_data['name'],
925                 'uploader_id': data['user']['login'],
926                 'ext': 'm4a',
927             }
928             res.append(info)
929             if api_data['set']['at_last_track']:
930                 break
931             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
932         return res
933
934 class KeekIE(InfoExtractor):
935     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
936     IE_NAME = u'keek'
937
938     def _real_extract(self, url):
939         m = re.match(self._VALID_URL, url)
940         video_id = m.group('videoID')
941
942         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
943         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
944         webpage = self._download_webpage(url, video_id)
945
946         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
947             webpage, u'title')
948
949         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
950             webpage, u'uploader', fatal=False)
951
952         info = {
953                 'id': video_id,
954                 'url': video_url,
955                 'ext': 'mp4',
956                 'title': video_title,
957                 'thumbnail': thumbnail,
958                 'uploader': uploader
959         }
960         return [info]
961
962 class TEDIE(InfoExtractor):
963     _VALID_URL=r'''http://www\.ted\.com/
964                    (
965                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
966                         |
967                         ((?P<type_talk>talks)) # We have a simple talk
968                    )
969                    (/lang/(.*?))? # The url may contain the language
970                    /(?P<name>\w+) # Here goes the name and then ".html"
971                    '''
972
973     @classmethod
974     def suitable(cls, url):
975         """Receives a URL and returns True if suitable for this IE."""
976         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
977
978     def _real_extract(self, url):
979         m=re.match(self._VALID_URL, url, re.VERBOSE)
980         if m.group('type_talk'):
981             return [self._talk_info(url)]
982         else :
983             playlist_id=m.group('playlist_id')
984             name=m.group('name')
985             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
986             return [self._playlist_videos_info(url,name,playlist_id)]
987
988     def _playlist_videos_info(self,url,name,playlist_id=0):
989         '''Returns the videos of the playlist'''
990         video_RE=r'''
991                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
992                      ([.\s]*?)data-playlist_item_id="(\d+)"
993                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
994                      '''
995         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
996         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
997         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
998         m_names=re.finditer(video_name_RE,webpage)
999
1000         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1001                                                  webpage, 'playlist title')
1002
1003         playlist_entries = []
1004         for m_video, m_name in zip(m_videos,m_names):
1005             video_id=m_video.group('video_id')
1006             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1007             playlist_entries.append(self.url_result(talk_url, 'TED'))
1008         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1009
1010     def _talk_info(self, url, video_id=0):
1011         """Return the video for the talk in the url"""
1012         m = re.match(self._VALID_URL, url,re.VERBOSE)
1013         video_name = m.group('name')
1014         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1015         self.report_extraction(video_name)
1016         # If the url includes the language we get the title translated
1017         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1018                                         webpage, 'title')
1019         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1020                                     webpage, 'json data')
1021         info = json.loads(json_data)
1022         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1023                                        webpage, 'description', flags = re.DOTALL)
1024         
1025         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1026                                        webpage, 'thumbnail')
1027         info = {
1028                 'id': info['id'],
1029                 'url': info['htmlStreams'][-1]['file'],
1030                 'ext': 'mp4',
1031                 'title': title,
1032                 'thumbnail': thumbnail,
1033                 'description': desc,
1034                 }
1035         return info
1036
1037 class MySpassIE(InfoExtractor):
1038     _VALID_URL = r'http://www.myspass.de/.*'
1039
1040     def _real_extract(self, url):
1041         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1042
1043         # video id is the last path element of the URL
1044         # usually there is a trailing slash, so also try the second-to-last
1045         url_path = compat_urllib_parse_urlparse(url).path
1046         url_parent_path, video_id = os.path.split(url_path)
1047         if not video_id:
1048             _, video_id = os.path.split(url_parent_path)
1049
1050         # get metadata
1051         metadata_url = META_DATA_URL_TEMPLATE % video_id
1052         metadata_text = self._download_webpage(metadata_url, video_id)
1053         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1054
1055         # extract values from metadata
1056         url_flv_el = metadata.find('url_flv')
1057         if url_flv_el is None:
1058             raise ExtractorError(u'Unable to extract download url')
1059         video_url = url_flv_el.text
1060         extension = os.path.splitext(video_url)[1][1:]
1061         title_el = metadata.find('title')
1062         if title_el is None:
1063             raise ExtractorError(u'Unable to extract title')
1064         title = title_el.text
1065         format_id_el = metadata.find('format_id')
1066         if format_id_el is None:
1067             format = extension
1068         else:
1069             format = format_id_el.text
1070         description_el = metadata.find('description')
1071         if description_el is not None:
1072             description = description_el.text
1073         else:
1074             description = None
1075         imagePreview_el = metadata.find('imagePreview')
1076         if imagePreview_el is not None:
1077             thumbnail = imagePreview_el.text
1078         else:
1079             thumbnail = None
1080         info = {
1081             'id': video_id,
1082             'url': video_url,
1083             'title': title,
1084             'ext': extension,
1085             'format': format,
1086             'thumbnail': thumbnail,
1087             'description': description
1088         }
1089         return [info]
1090
1091 class SpiegelIE(InfoExtractor):
1092     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1093
1094     def _real_extract(self, url):
1095         m = re.match(self._VALID_URL, url)
1096         video_id = m.group('videoID')
1097
1098         webpage = self._download_webpage(url, video_id)
1099
1100         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1101             webpage, u'title')
1102
1103         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1104         xml_code = self._download_webpage(xml_url, video_id,
1105                     note=u'Downloading XML', errnote=u'Failed to download XML')
1106
1107         idoc = xml.etree.ElementTree.fromstring(xml_code)
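        # The XML lists one child element per available variant; the last one
        # is assumed to be the highest quality and is the one downloaded here.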
1108         last_type = idoc[-1]
1109         filename = last_type.findall('./filename')[0].text
1110         duration = float(last_type.findall('./duration')[0].text)
1111
1112         video_url = 'http://video2.spiegel.de/flash/' + filename
1113         video_ext = filename.rpartition('.')[2]
1114         info = {
1115             'id': video_id,
1116             'url': video_url,
1117             'ext': video_ext,
1118             'title': video_title,
1119             'duration': duration,
1120         }
1121         return [info]
1122
1123 class LiveLeakIE(InfoExtractor):
1124
1125     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1126     IE_NAME = u'liveleak'
1127
1128     def _real_extract(self, url):
1129         mobj = re.match(self._VALID_URL, url)
1130         if mobj is None:
1131             raise ExtractorError(u'Invalid URL: %s' % url)
1132
1133         video_id = mobj.group('video_id')
1134
1135         webpage = self._download_webpage(url, video_id)
1136
1137         video_url = self._search_regex(r'file: "(.*?)",',
1138             webpage, u'video URL')
1139
1140         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1141             webpage, u'title').replace('LiveLeak.com -', '').strip()
1142
1143         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1144             webpage, u'description', fatal=False)
1145
1146         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1147             webpage, u'uploader', fatal=False)
1148
1149         info = {
1150             'id':  video_id,
1151             'url': video_url,
1152             'ext': 'mp4',
1153             'title': video_title,
1154             'description': video_description,
1155             'uploader': video_uploader
1156         }
1157
1158         return [info]
1159
1160
1161
1162 class TumblrIE(InfoExtractor):
1163     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1164
1165     def _real_extract(self, url):
1166         m_url = re.match(self._VALID_URL, url)
1167         video_id = m_url.group('id')
1168         blog = m_url.group('blog_name')
1169
1170         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1171         webpage = self._download_webpage(url, video_id)
1172
1173         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1174         video = re.search(re_video, webpage)
1175         if video is None:
1176            raise ExtractorError(u'Unable to extract video')
1177         video_url = video.group('video_url')
1178         ext = video.group('ext')
1179
1180         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1181             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1182         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1183
1184         # This is the only place a title can be found; it's not complete,
1185         # but searching in other places doesn't work for all videos
1186         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1187             webpage, u'title', flags=re.DOTALL)
1188
1189         return [{'id': video_id,
1190                  'url': video_url,
1191                  'title': video_title,
1192                  'thumbnail': video_thumbnail,
1193                  'ext': ext
1194                  }]
1195
1196 class BandcampIE(InfoExtractor):
1197     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1198
1199     def _real_extract(self, url):
1200         mobj = re.match(self._VALID_URL, url)
1201         title = mobj.group('title')
1202         webpage = self._download_webpage(url, title)
1203         # We get the link to the free download page
1204         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1205         if m_download is None:
1206             raise ExtractorError(u'No free songs found')
1207
1208         download_link = m_download.group(1)
1209         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1210                        webpage, re.MULTILINE|re.DOTALL).group('id')
1211
1212         download_webpage = self._download_webpage(download_link, id,
1213                                                   'Downloading free downloads page')
1214         # We get the dictionary of the track from some javascript code
1215         info = re.search(r'items: (.*?),$',
1216                          download_webpage, re.MULTILINE).group(1)
1217         info = json.loads(info)[0]
1218         # We pick mp3-320 for now, until format selection can be easily implemented.
1219         mp3_info = info[u'downloads'][u'mp3-320']
1220         # If we try to use this url it says the link has expired
1221         initial_url = mp3_info[u'url']
1222         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1223         m_url = re.match(re_url, initial_url)
1224         # We build the url we will use to get the final track url
1225         # This url is built by Bandcamp in the script download_bunde_*.js
1226         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1227         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1228         # If we could correctly generate the .rand field the url would be
1229         # in the "download_url" key
1230         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1231
1232         track_info = {'id':id,
1233                       'title' : info[u'title'],
1234                       'ext' :   'mp3',
1235                       'url' :   final_url,
1236                       'thumbnail' : info[u'thumb_url'],
1237                       'uploader' :  info[u'artist']
1238                       }
1239
1240         return [track_info]
1241
1242 class RedTubeIE(InfoExtractor):
1243     """Information Extractor for redtube"""
1244     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1245
1246     def _real_extract(self,url):
1247         mobj = re.match(self._VALID_URL, url)
1248         if mobj is None:
1249             raise ExtractorError(u'Invalid URL: %s' % url)
1250
1251         video_id = mobj.group('id')
1252         video_extension = 'mp4'        
1253         webpage = self._download_webpage(url, video_id)
1254
1255         self.report_extraction(video_id)
1256
1257         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1258             webpage, u'video URL')
1259
1260         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1261             webpage, u'title')
1262
1263         return [{
1264             'id':       video_id,
1265             'url':      video_url,
1266             'ext':      video_extension,
1267             'title':    video_title,
1268         }]
1269         
1270 class InaIE(InfoExtractor):
1271     """Information Extractor for Ina.fr"""
1272     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1273
1274     def _real_extract(self,url):
1275         mobj = re.match(self._VALID_URL, url)
1276
1277         video_id = mobj.group('id')
1278         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1279         video_extension = 'mp4'
1280         webpage = self._download_webpage(mrss_url, video_id)
1281
1282         self.report_extraction(video_id)
1283
1284         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1285             webpage, u'video URL')
1286
1287         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1288             webpage, u'title')
1289
1290         return [{
1291             'id':       video_id,
1292             'url':      video_url,
1293             'ext':      video_extension,
1294             'title':    video_title,
1295         }]
1296
1297 class HowcastIE(InfoExtractor):
1298     """Information Extractor for Howcast.com"""
1299     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1300
1301     def _real_extract(self, url):
1302         mobj = re.match(self._VALID_URL, url)
1303
1304         video_id = mobj.group('id')
1305         webpage_url = 'http://www.howcast.com/videos/' + video_id
1306         webpage = self._download_webpage(webpage_url, video_id)
1307
1308         self.report_extraction(video_id)
1309
1310         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1311             webpage, u'video URL')
1312
1313         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1314             webpage, u'title')
1315
1316         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1317             webpage, u'description', fatal=False)
1318
1319         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1320             webpage, u'thumbnail', fatal=False)
1321
1322         return [{
1323             'id':       video_id,
1324             'url':      video_url,
1325             'ext':      'mp4',
1326             'title':    video_title,
1327             'description': video_description,
1328             'thumbnail': thumbnail,
1329         }]
1330
1331 class VineIE(InfoExtractor):
1332     """Information Extractor for Vine.co"""
1333     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1334
1335     def _real_extract(self, url):
1336         mobj = re.match(self._VALID_URL, url)
1337
1338         video_id = mobj.group('id')
1339         webpage_url = 'https://vine.co/v/' + video_id
1340         webpage = self._download_webpage(webpage_url, video_id)
1341
1342         self.report_extraction(video_id)
1343
1344         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1345             webpage, u'video URL')
1346
1347         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1348             webpage, u'title')
1349
1350         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1351             webpage, u'thumbnail', fatal=False)
1352
1353         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1354             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1355
1356         return [{
1357             'id':        video_id,
1358             'url':       video_url,
1359             'ext':       'mp4',
1360             'title':     video_title,
1361             'thumbnail': thumbnail,
1362             'uploader':  uploader,
1363         }]
1364
1365 class FlickrIE(InfoExtractor):
1366     """Information Extractor for Flickr videos"""
1367     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1368
1369     def _real_extract(self, url):
1370         mobj = re.match(self._VALID_URL, url)
1371
1372         video_id = mobj.group('id')
1373         video_uploader_id = mobj.group('uploader_id')
1374         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1375         webpage = self._download_webpage(webpage_url, video_id)
1376
1377         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1378
1379         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1380         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1381
1382         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1383             first_xml, u'node_id')
1384
1385         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1386         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1387
1388         self.report_extraction(video_id)
1389
1390         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1391         if mobj is None:
1392             raise ExtractorError(u'Unable to extract video url')
1393         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1394
1395         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1396             webpage, u'video title')
1397
1398         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1399             webpage, u'description', fatal=False)
1400
1401         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1402             webpage, u'thumbnail', fatal=False)
1403
1404         return [{
1405             'id':          video_id,
1406             'url':         video_url,
1407             'ext':         'mp4',
1408             'title':       video_title,
1409             'description': video_description,
1410             'thumbnail':   thumbnail,
1411             'uploader_id': video_uploader_id,
1412         }]
1413
1414 class TeamcocoIE(InfoExtractor):
1415     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1416
1417     def _real_extract(self, url):
1418         mobj = re.match(self._VALID_URL, url)
1419         if mobj is None:
1420             raise ExtractorError(u'Invalid URL: %s' % url)
1421         url_title = mobj.group('url_title')
1422         webpage = self._download_webpage(url, url_title)
1423
1424         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1425             webpage, u'video id')
1426
1427         self.report_extraction(video_id)
1428
1429         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1430             webpage, u'title')
1431
1432         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1433             webpage, u'thumbnail', fatal=False)
1434
1435         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1436             webpage, u'description', fatal=False)
1437
1438         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1439         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1440
1441         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1442             data, u'video URL')
1443
1444         return [{
1445             'id':          video_id,
1446             'url':         video_url,
1447             'ext':         'mp4',
1448             'title':       video_title,
1449             'thumbnail':   thumbnail,
1450             'description': video_description,
1451         }]
1452
1453 class XHamsterIE(InfoExtractor):
1454     """Information Extractor for xHamster"""
1455     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1456
1457     def _real_extract(self, url):
1458         mobj = re.match(self._VALID_URL, url)
1459
1460         video_id = mobj.group('id')
1461         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1462         webpage = self._download_webpage(mrss_url, video_id)
1463
1464         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1465         if mobj is None:
1466             raise ExtractorError(u'Unable to extract media URL')
1467         if len(mobj.group('server')) == 0:
1468             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1469         else:
1470             video_url = mobj.group('server')+'/key='+mobj.group('file')
1471         video_extension = video_url.split('.')[-1]
1472
1473         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1474             webpage, u'title')
1475
1476         # Can't see the description anywhere in the UI
1477         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1478         #     webpage, u'description', fatal=False)
1479         # if video_description: video_description = unescapeHTML(video_description)
1480
1481         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1482         if mobj:
1483             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1484         else:
1485             video_upload_date = None
1486             self._downloader.report_warning(u'Unable to extract upload date')
1487
1488         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1489             webpage, u'uploader id', default=u'anonymous')
1490
1491         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1492             webpage, u'thumbnail', fatal=False)
1493
1494         return [{
1495             'id':       video_id,
1496             'url':      video_url,
1497             'ext':      video_extension,
1498             'title':    video_title,
1499             # 'description': video_description,
1500             'upload_date': video_upload_date,
1501             'uploader_id': video_uploader_id,
1502             'thumbnail': video_thumbnail
1503         }]
1504
1505 class HypemIE(InfoExtractor):
1506     """Information Extractor for hypem"""
1507     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1508
1509     def _real_extract(self, url):
1510         mobj = re.match(self._VALID_URL, url)
1511         if mobj is None:
1512             raise ExtractorError(u'Invalid URL: %s' % url)
1513         track_id = mobj.group(1)
1514
1515         data = { 'ax': 1, 'ts': time.time() }
1516         data_encoded = compat_urllib_parse.urlencode(data)
1517         complete_url = url + "?" + data_encoded
1518         request = compat_urllib_request.Request(complete_url)
1519         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage')
1520         cookie = urlh.headers.get('Set-Cookie', '')
1521
1522         self.report_extraction(track_id)
1523
1524         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1525             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1526         try:
1527             track_list = json.loads(html_tracks)
1528             track = track_list[u'tracks'][0]
1529         except ValueError:
1530             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1531
1532         key = track[u"key"]
1533         track_id = track[u"id"]
1534         artist = track[u"artist"]
1535         title = track[u"song"]
1536
1537         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1538         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1539         request.add_header('cookie', cookie)
1540         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1541         try:
1542             song_data = json.loads(song_data_json)
1543         except ValueError:
1544             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1545         final_url = song_data[u"url"]
1546
1547         return [{
1548             'id':       track_id,
1549             'url':      final_url,
1550             'ext':      "mp3",
1551             'title':    title,
1552             'artist':   artist,
1553         }]
1554
1555 class Vbox7IE(InfoExtractor):
1556     """Information Extractor for Vbox7"""
1557     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1558
1559     def _real_extract(self, url):
1560         mobj = re.match(self._VALID_URL, url)
1561         if mobj is None:
1562             raise ExtractorError(u'Invalid URL: %s' % url)
1563         video_id = mobj.group(1)
1564
1565         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1566         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1567         redirect_url = urlh.geturl() + new_location
1568         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1569
1570         title = self._html_search_regex(r'<title>(.*)</title>',
1571             webpage, u'title').split('/')[0].strip()
1572
1573         ext = "flv"
1574         info_url = "http://vbox7.com/play/magare.do"
1575         data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
1576         info_request = compat_urllib_request.Request(info_url, data)
1577         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1578         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1579         if info_response is None:
1580             raise ExtractorError(u'Unable to extract the media url')
1581         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1582
1583         return [{
1584             'id':        video_id,
1585             'url':       final_url,
1586             'ext':       ext,
1587             'title':     title,
1588             'thumbnail': thumbnail_url,
1589         }]
1590
1591
1592 def gen_extractors():
1593     """ Return a list containing an instance of every supported extractor.
1594     The order matters: the first extractor that matches handles the URL (see the dispatch sketch below).
1595     """
1596     return [
1597         YoutubePlaylistIE(),
1598         YoutubeChannelIE(),
1599         YoutubeUserIE(),
1600         YoutubeSearchIE(),
1601         YoutubeIE(),
1602         MetacafeIE(),
1603         DailymotionIE(),
1604         GoogleSearchIE(),
1605         PhotobucketIE(),
1606         YahooIE(),
1607         YahooSearchIE(),
1608         DepositFilesIE(),
1609         FacebookIE(),
1610         BlipTVIE(),
1611         BlipTVUserIE(),
1612         VimeoIE(),
1613         MyVideoIE(),
1614         ComedyCentralIE(),
1615         EscapistIE(),
1616         CollegeHumorIE(),
1617         XVideosIE(),
1618         SoundcloudSetIE(),
1619         SoundcloudIE(),
1620         InfoQIE(),
1621         MixcloudIE(),
1622         StanfordOpenClassroomIE(),
1623         MTVIE(),
1624         YoukuIE(),
1625         XNXXIE(),
1626         YouJizzIE(),
1627         PornotubeIE(),
1628         YouPornIE(),
1629         GooglePlusIE(),
1630         ArteTvIE(),
1631         NBAIE(),
1632         WorldStarHipHopIE(),
1633         JustinTVIE(),
1634         FunnyOrDieIE(),
1635         SteamIE(),
1636         UstreamIE(),
1637         RBMARadioIE(),
1638         EightTracksIE(),
1639         KeekIE(),
1640         TEDIE(),
1641         MySpassIE(),
1642         SpiegelIE(),
1643         LiveLeakIE(),
1644         ARDIE(),
1645         ZDFIE(),
1646         TumblrIE(),
1647         BandcampIE(),
1648         RedTubeIE(),
1649         InaIE(),
1650         HowcastIE(),
1651         VineIE(),
1652         FlickrIE(),
1653         TeamcocoIE(),
1654         XHamsterIE(),
1655         HypemIE(),
1656         Vbox7IE(),
1657         GametrailersIE(),
1658         StatigramIE(),
1659         GenericIE()
1660     ]
1661
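# A minimal sketch, not part of youtube-dl's own control flow, of how the ordered
# list above is typically consumed: walk the extractors in order and let the first
# one whose suitable() check accepts the URL claim it. This is why GenericIE, which
# accepts almost any URL, must stay last. The helper name below is illustrative only.
def _first_suitable_extractor(url):
    """Return the first extractor instance whose suitable() accepts url, else None."""
    for ie in gen_extractors():
        if ie.suitable(url):
            return ie
    return None
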
1662 def get_info_extractor(ie_name):
1663     """Returns the info extractor class with the given ie_name"""
1664     return globals()[ie_name+'IE']
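
# For example, get_info_extractor('Youtube') resolves to the YoutubeIE class imported
# at the top of this module; note that the class itself is returned, not an instance.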