10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.googleplus import GooglePlusIE
29 from .extractor.googlesearch import GoogleSearchIE
30 from .extractor.metacafe import MetacafeIE
31 from .extractor.myvideo import MyVideoIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE, YahooSearchIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
# Extractor for depositfiles.com: forces the English locale, simulates the
# "Free download" button, then scrapes the real file URL out of the page.
# NOTE(review): this listing has elided lines (the try: headers matching the
# except clauses below, and the opening of the returned info dict are not visible).
57 class DepositFilesIE(InfoExtractor):
58 """Information extractor for depositfiles.com"""
60 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
62 def _real_extract(self, url):
# The file id is simply the last path component of the URL.
63 file_id = url.split('/')[-1]
64 # Rebuild url in english locale
65 url = 'http://depositfiles.com/en/files/' + file_id
67 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 is what the site's "Free download" form submits.
68 free_download_indication = { 'gateway_result' : '1' }
69 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
71 self.report_download_webpage(file_id)
72 webpage = compat_urllib_request.urlopen(request).read()
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
76 # Search for the real file URL
77 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
78 if (mobj is None) or (mobj.group(1) is None):
79 # Try to figure out reason of the error.
# The site reports download restrictions inside a <strong>Attention...</strong> block.
80 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
81 if (mobj is not None) and (mobj.group(1) is not None):
82 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
83 raise ExtractorError(u'%s' % restriction_message)
85 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
87 file_url = mobj.group(1)
# Extension without the leading dot (splitext keeps it, [1:] strips it).
88 file_extension = os.path.splitext(file_url)[1][1:]
90 # Search for file title
91 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') assumes Python 2 byte strings; on Python 3
# str has no decode() — confirm against the project's supported interpreter.
94 'id': file_id.decode('utf-8'),
95 'url': file_url.decode('utf-8'),
99 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Logs in during _real_initialize (credentials
# from downloader params or ~/.netrc), then parses the JSON blob embedded
# between two known JavaScript fragments on the video page.
# NOTE(review): several guard/try lines are elided from this listing.
103 class FacebookIE(InfoExtractor):
104 """Information Extractor for Facebook"""
106 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
107 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
108 _NETRC_MACHINE = 'facebook'
109 IE_NAME = u'facebook'
111 def report_login(self):
112 """Report attempt to log in."""
113 self.to_screen(u'Logging in')
# Runs once before extraction: best-effort login; failures only warn.
115 def _real_initialize(self):
116 if self._downloader is None:
121 downloader_params = self._downloader.params
123 # Attempt to use provided username and password or .netrc data
124 if downloader_params.get('username', None) is not None:
125 useremail = downloader_params['username']
126 password = downloader_params['password']
127 elif downloader_params.get('usenetrc', False):
129 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
134 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
135 except (IOError, netrc.NetrcParseError) as err:
136 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely (anonymous access).
139 if useremail is None:
148 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
151 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
152 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
153 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
155 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
156 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
159 def _real_extract(self, url):
160 mobj = re.match(self._VALID_URL, url)
162 raise ExtractorError(u'Invalid URL: %s' % url)
163 video_id = mobj.group('ID')
165 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
166 webpage = self._download_webpage(url, video_id)
# The player parameters are a JSON array sandwiched between these two exact
# JavaScript fragments in the page source.
168 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
169 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
170 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
172 raise ExtractorError(u'Cannot parse data')
173 data = dict(json.loads(m.group(1)))
174 params_raw = compat_urllib_parse.unquote(data['params'])
175 params = json.loads(params_raw)
176 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD (fallback line elided in listing).
177 video_url = video_data.get('hd_src')
179 video_url = video_data['sd_src']
181 raise ExtractorError(u'Cannot find video URL')
182 video_duration = int(video_data['video_duration'])
183 thumbnail = video_data['thumbnail_src']
185 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
190 'title': video_title,
193 'duration': video_duration,
194 'thumbnail': thumbnail,
204 class EscapistIE(InfoExtractor):
205 """Information extractor for The Escapist """
207 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
208 IE_NAME = u'escapist'
210 def _real_extract(self, url):
211 mobj = re.match(self._VALID_URL, url)
213 raise ExtractorError(u'Invalid URL: %s' % url)
214 showName = mobj.group('showname')
215 videoId = mobj.group('episode')
217 self.report_extraction(videoId)
218 webpage = self._download_webpage(url, videoId)
220 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
221 webpage, u'description', fatal=False)
223 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
224 webpage, u'thumbnail', fatal=False)
226 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
227 webpage, u'player url')
229 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
230 webpage, u'player url').split(' : ')[-1]
232 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
233 configUrl = compat_urllib_parse.unquote(configUrl)
235 configJSON = self._download_webpage(configUrl, videoId,
236 u'Downloading configuration',
237 u'unable to download configuration')
239 # Technically, it's JavaScript, not JSON
240 configJSON = configJSON.replace("'", '"')
243 config = json.loads(configJSON)
244 except (ValueError,) as err:
245 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
247 playlist = config['playlist']
248 videoUrl = playlist[1]['url']
253 'uploader': showName,
258 'description': videoDesc,
259 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches a metadata XML for the video,
# then an Adobe F4M manifest, and reconstructs the stream URL from the
# manifest's media/id nodes.
# NOTE(review): try: headers for the except clauses and the info-dict
# initialization are elided from this listing.
264 class CollegeHumorIE(InfoExtractor):
265 """Information extractor for collegehumor.com"""
268 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
269 IE_NAME = u'collegehumor'
271 def report_manifest(self, video_id):
272 """Report information extraction."""
273 self.to_screen(u'%s: Downloading XML manifest' % video_id)
275 def _real_extract(self, url):
276 mobj = re.match(self._VALID_URL, url)
278 raise ExtractorError(u'Invalid URL: %s' % url)
279 video_id = mobj.group('videoid')
287 self.report_extraction(video_id)
288 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
290 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
291 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
292 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
294 mdoc = xml.etree.ElementTree.fromstring(metaXml)
296 videoNode = mdoc.findall('./video')[0]
297 info['description'] = videoNode.findall('./description')[0].text
298 info['title'] = videoNode.findall('./caption')[0].text
299 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
300 manifest_url = videoNode.findall('./file')[0].text
302 raise ExtractorError(u'Invalid metadata XML file')
# hdcore is an HDS (Adobe HTTP Dynamic Streaming) query parameter.
304 manifest_url += '?hdcore=2.10.3'
305 self.report_manifest(video_id)
307 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
309 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
311 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# F4M elements live in the Adobe namespace, hence the {…} qualified tags.
313 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
314 node_id = media_node.attrib['url']
315 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
316 except IndexError as err:
317 raise ExtractorError(u'Invalid manifest file')
319 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the fragment URL from the manifest host plus media/id pieces;
# [:-2] drops the id's trailing two characters — presumably a quality
# suffix, TODO confirm.
320 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: video URL comes URL-encoded from the flv_url
# page parameter; title and thumbnail are scraped from the HTML.
327 class XVideosIE(InfoExtractor):
328 """Information extractor for xvideos.com"""
330 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
333 def _real_extract(self, url):
334 mobj = re.match(self._VALID_URL, url)
336 raise ExtractorError(u'Invalid URL: %s' % url)
337 video_id = mobj.group(1)
339 webpage = self._download_webpage(url, video_id)
341 self.report_extraction(video_id)
# flv_url is percent-encoded inside the page, hence the unquote.
344 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
345 webpage, u'video URL'))
# Title is everything in <title> before the " - XVID..." suffix.
348 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
351 # Extract video thumbnail
352 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
353 webpage, u'thumbnail', fatal=False)
360 'title': video_title,
362 'thumbnail': video_thumbnail,
# Extractor for single SoundCloud tracks: resolves the page URL to a track id
# via the public resolve.json API, then asks the streams endpoint for the
# 128kbps MP3 URL. The client_id is a hard-coded public API key.
369 class SoundcloudIE(InfoExtractor):
370 """Information extractor for soundcloud.com
371 To access the media, the uid of the song and a stream token
372 must be extracted from the page source and the script must make
373 a request to media.soundcloud.com/crossdomain.xml. Then
374 the media can be grabbed by requesting from an url composed
375 of the stream token and uid
378 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
379 IE_NAME = u'soundcloud'
381 def report_resolve(self, video_id):
382 """Report information extraction."""
383 self.to_screen(u'%s: Resolving id' % video_id)
384 
385 def _real_extract(self, url):
386 mobj = re.match(self._VALID_URL, url)
388 raise ExtractorError(u'Invalid URL: %s' % url)
390 # extract uploader (which is in the url)
391 uploader = mobj.group(1)
392 # extract simple title (uploader + slug of song title)
393 slug_title = mobj.group(2)
394 simple_title = uploader + u'-' + slug_title
395 full_title = '%s/%s' % (uploader, slug_title)
397 self.report_resolve(full_title)
399 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
400 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
401 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
403 info = json.loads(info_json)
404 video_id = info['id']
405 self.report_extraction(full_title)
407 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
408 stream_json = self._download_webpage(streams_url, full_title,
409 u'Downloading stream definitions',
410 u'unable to download stream definitions')
412 streams = json.loads(stream_json)
# Only the 128kbps MP3 stream is used; other formats are ignored.
413 mediaURL = streams['http_mp3_128_url']
414 upload_date = unified_strdate(info['created_at'])
419 'uploader': info['user']['username'],
420 'upload_date': upload_date,
421 'title': info['title'],
423 'description': info['description'],
# Extractor for SoundCloud sets (playlists): same resolve/streams flow as
# SoundcloudIE, but iterates every track in the resolved set.
426 class SoundcloudSetIE(InfoExtractor):
427 """Information extractor for soundcloud.com sets
428 To access the media, the uid of the song and a stream token
429 must be extracted from the page source and the script must make
430 a request to media.soundcloud.com/crossdomain.xml. Then
431 the media can be grabbed by requesting from an url composed
432 of the stream token and uid
435 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
436 IE_NAME = u'soundcloud:set'
438 def report_resolve(self, video_id):
439 """Report information extraction."""
440 self.to_screen(u'%s: Resolving id' % video_id)
442 def _real_extract(self, url):
443 mobj = re.match(self._VALID_URL, url)
445 raise ExtractorError(u'Invalid URL: %s' % url)
447 # extract uploader (which is in the url)
448 uploader = mobj.group(1)
449 # extract simple title (uploader + slug of song title)
450 slug_title = mobj.group(2)
451 simple_title = uploader + u'-' + slug_title
452 full_title = '%s/sets/%s' % (uploader, slug_title)
454 self.report_resolve(full_title)
456 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
457 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
458 info_json = self._download_webpage(resolv_url, full_title)
461 info = json.loads(info_json)
# The resolve API reports failures as an 'errors' list in the JSON body;
# each entry is surfaced to the user (guard line elided in this listing).
463 for err in info['errors']:
464 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
467 self.report_extraction(full_title)
468 for track in info['tracks']:
469 video_id = track['id']
471 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
472 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
474 self.report_extraction(video_id)
475 streams = json.loads(stream_json)
476 mediaURL = streams['http_mp3_128_url']
481 'uploader': track['user']['username'],
482 'upload_date': unified_strdate(track['created_at']),
483 'title': track['title'],
485 'description': track['description'],
# Extractor for infoq.com: the real media path is base64-encoded in the
# page's jsclassref JavaScript variable and served over RTMPE.
490 class InfoQIE(InfoExtractor):
491 """Information extractor for infoq.com"""
492 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
494 def _real_extract(self, url):
495 mobj = re.match(self._VALID_URL, url)
497 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL scheme, so the URL itself serves as the id here.
499 webpage = self._download_webpage(url, video_id=url)
500 self.report_extraction(url)
503 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
505 raise ExtractorError(u'Unable to extract video url')
# jsclassref holds a base64-encoded, percent-encoded media path.
506 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
507 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
510 video_title = self._search_regex(r'contentTitle = "(.*?)";',
513 # Extract description
514 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
515 webpage, u'description', fatal=False)
# Derive a stable id and extension from the media filename itself.
517 video_filename = video_url.split('/')[-1]
518 video_id, extension = video_filename.split('.')
525 'title': video_title,
526 'ext': extension, # Extension is always(?) mp4, but seems to be flv
528 'description': video_description,
# Extractor for mixcloud.com (disabled via _WORKING = False): fetches the
# cloudcast JSON from the old v1 API, then probes candidate stream URLs
# until one responds.
# NOTE(review): several try: headers and loop bodies are elided from this
# listing; the .decode('utf-8') calls assume Python 2 byte strings.
533 class MixcloudIE(InfoExtractor):
534 """Information extractor for www.mixcloud.com"""
536 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
537 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
538 IE_NAME = u'mixcloud'
540 def report_download_json(self, file_id):
541 """Report JSON download."""
542 self.to_screen(u'Downloading json')
544 def get_urls(self, jsonData, fmt, bitrate='best'):
545 """Get urls from 'audio_formats' section in json"""
548 bitrate_list = jsonData[fmt]
549 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# Bitrates are dict keys; max() picks the numerically highest one.
550 bitrate = max(bitrate_list) # select highest
552 url_list = jsonData[fmt][bitrate]
553 except TypeError: # we have no bitrate info.
554 url_list = jsonData[fmt]
557 def check_urls(self, url_list):
558 """Returns 1st active url from list"""
# Probe each candidate with a HEAD-less GET; first non-erroring URL wins.
561 compat_urllib_request.urlopen(url)
563 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
568 def _print_formats(self, formats):
569 print('Available formats:')
570 for fmt in formats.keys():
571 for b in formats[fmt]:
573 ext = formats[fmt][b][0]
574 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
575 except TypeError: # we have no bitrate info
576 ext = formats[fmt][0]
577 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
580 def _real_extract(self, url):
581 mobj = re.match(self._VALID_URL, url)
583 raise ExtractorError(u'Invalid URL: %s' % url)
584 # extract uploader & filename from url
585 uploader = mobj.group(1).decode('utf-8')
586 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
588 # construct API request
# The API path reuses the last two URL path components (uploader/cloudcast).
589 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
590 # retrieve .json file with links to files
591 request = compat_urllib_request.Request(file_url)
593 self.report_download_json(file_url)
594 jsonData = compat_urllib_request.urlopen(request).read()
595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
596 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
599 json_data = json.loads(jsonData)
600 player_url = json_data['player_swf_url']
601 formats = dict(json_data['audio_formats'])
603 req_format = self._downloader.params.get('format', None)
606 if self._downloader.params.get('listformats', None):
607 self._print_formats(formats)
# Default/best: walk all formats and take the first one with a live URL.
610 if req_format is None or req_format == 'best':
611 for format_param in formats.keys():
612 url_list = self.get_urls(formats, format_param)
614 file_url = self.check_urls(url_list)
615 if file_url is not None:
618 if req_format not in formats:
619 raise ExtractorError(u'Format is not available')
621 url_list = self.get_urls(formats, req_format)
622 file_url = self.check_urls(url_list)
623 format_param = req_format
626 'id': file_id.decode('utf-8'),
627 'url': file_url.decode('utf-8'),
628 'uploader': uploader.decode('utf-8'),
630 'title': json_data['name'],
631 'ext': file_url.split('.')[-1].decode('utf-8'),
# Pre-ternary Python "and/or" idiom: u'NA' when no format was chosen.
632 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
633 'thumbnail': json_data['thumbnail_url'],
634 'description': json_data['description'],
635 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom. Handles three URL shapes:
# a specific video (course+video params), a course page (course only, which
# expands into per-video references), and the site root (which expands into
# per-course references). References are recursively re-extracted.
638 class StanfordOpenClassroomIE(InfoExtractor):
639 """Information extractor for Stanford's Open ClassRoom"""
641 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
642 IE_NAME = u'stanfordoc'
644 def _real_extract(self, url):
645 mobj = re.match(self._VALID_URL, url)
647 raise ExtractorError(u'Invalid URL: %s' % url)
649 if mobj.group('course') and mobj.group('video'): # A specific video
650 course = mobj.group('course')
651 video = mobj.group('video')
653 'id': course + '_' + video,
658 self.report_extraction(info['id'])
# Each video has a sibling .xml metadata file under the course directory.
659 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
660 xmlUrl = baseUrl + video + '.xml'
662 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
663 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
664 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
665 mdoc = xml.etree.ElementTree.fromstring(metaXml)
667 info['title'] = mdoc.findall('./title')[0].text
668 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
670 raise ExtractorError(u'Invalid metadata XML file')
671 info['ext'] = info['url'].rpartition('.')[2]
673 elif mobj.group('course'): # A course page
674 course = mobj.group('course')
682 coursepage = self._download_webpage(url, info['id'],
683 note='Downloading course info page',
684 errnote='Unable to download course info page')
686 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
688 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
689 coursepage, u'description', fatal=False)
# Collect unique VideoPage links in page order; each becomes a reference entry.
691 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
695 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
699 for entry in info['list']:
700 assert entry['type'] == 'reference'
# Recursive extraction: each reference URL re-enters this extractor.
701 results += self.extract(entry['url'])
705 'id': 'Stanford OpenClassroom',
711 self.report_download_webpage(info['id'])
712 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
714 rootpage = compat_urllib_request.urlopen(rootURL).read()
715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
716 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
718 info['title'] = info['id']
720 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
724 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
729 for entry in info['list']:
730 assert entry['type'] == 'reference'
731 results += self.extract(entry['url'])
# Extractor for MTV.com: scrapes mtv_* meta tags, then downloads the
# mediaGen XML playlist and picks the last (highest-quality) rendition.
734 class MTVIE(InfoExtractor):
735 """Information extractor for MTV.com"""
737 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
740 def _real_extract(self, url):
741 mobj = re.match(self._VALID_URL, url)
743 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL, so default it before downloading.
744 if not mobj.group('proto'):
745 url = 'http://' + url
746 video_id = mobj.group('videoid')
748 webpage = self._download_webpage(url, video_id)
750 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
751 webpage, u'song name', fatal=False)
753 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
756 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
757 webpage, u'mtvn_uri', fatal=False)
759 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
760 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri/content_id are fatal=False and may be None here,
# which would make this concatenation raise TypeError — verify upstream guard.
762 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
763 self.report_extraction(video_id)
764 request = compat_urllib_request.Request(videogen_url)
766 metadataXml = compat_urllib_request.urlopen(request).read()
767 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
768 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
770 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
771 renditions = mdoc.findall('.//rendition')
773 # For now, always pick the highest quality.
774 rendition = renditions[-1]
# e.g. type "video/mp4" -> ext "mp4"; format string like "mp4-640x480_800".
777 _,_,ext = rendition.attrib['type'].partition('/')
778 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
779 video_url = rendition.find('./src').text
781 raise ExtractorError('Invalid rendition field.')
786 'uploader': performer,
788 'title': video_title,
# Extractor for v.youku.com. Youku obfuscates segment file ids: a seeded
# pseudo-random shuffle of a character alphabet (_get_file_ID_mix_string)
# is indexed by the '*'-separated digits of the raw file id (_get_file_id).
# Videos are served as multiple FLV segments, one info dict per segment.
796 class YoukuIE(InfoExtractor):
797 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two random numbers (def line elided).
800 nowTime = int(time.time() * 1000)
801 random1 = random.randint(1000,1998)
802 random2 = random.randint(1000,9999)
804 return "%d%d%d" %(nowTime,random1,random2)
806 def _get_file_ID_mix_string(self, seed):
808 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# Linear-congruential shuffle: the seed deterministically permutes `source`.
810 for i in range(len(source)):
811 seed = (seed * 211 + 30031 ) % 65536
812 index = math.floor(seed / 65536 * len(source) )
813 mixed.append(source[int(index)])
814 source.remove(source[int(index)])
815 #return ''.join(mixed)
818 def _get_file_id(self, fileId, seed):
# Decode the real file id: each '*'-separated number indexes into the
# seed-shuffled alphabet.
819 mixed = self._get_file_ID_mix_string(seed)
820 ids = fileId.split('*')
824 realId.append(mixed[int(ch)])
825 return ''.join(realId)
827 def _real_extract(self, url):
828 mobj = re.match(self._VALID_URL, url)
830 raise ExtractorError(u'Invalid URL: %s' % url)
831 video_id = mobj.group('ID')
833 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
835 jsondata = self._download_webpage(info_url, video_id)
837 self.report_extraction(video_id)
839 config = json.loads(jsondata)
841 video_title = config['data'][0]['title']
842 seed = config['data'][0]['seed']
844 format = self._downloader.params.get('format', None)
845 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection ladder (several branches elided in this listing).
847 if format is None or format == 'best':
848 if 'hd2' in supported_format:
853 elif format == 'worst':
861 fileid = config['data'][0]['streamfileids'][format]
# One access key per segment; needed to build each download URL.
862 keys = [s['k'] for s in config['data'][0]['segs'][format]]
863 except (UnicodeDecodeError, ValueError, KeyError):
864 raise ExtractorError(u'Unable to extract info section')
867 sid = self._gen_sid()
868 fileid = self._get_file_id(fileid, seed)
870 #column 8,9 of fileid represent the segment number
871 #fileid[7:9] should be changed
872 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the file id.
874 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
875 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
878 'id': '%s_part%02d' % (video_id, index),
882 'title': video_title,
885 files_info.append(info)
# Extractor for video.xnxx.com: URL, title and thumbnail are each pulled
# from the page with the class-level regex constants below.
890 class XNXXIE(InfoExtractor):
891 """Information extractor for xnxx.com"""
893 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns, kept as class attributes for reuse/override.
895 VIDEO_URL_RE = r'flv_url=(.*?)&'
896 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
897 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
899 def _real_extract(self, url):
900 mobj = re.match(self._VALID_URL, url)
902 raise ExtractorError(u'Invalid URL: %s' % url)
903 video_id = mobj.group(1)
905 # Get webpage content
906 webpage = self._download_webpage(url, video_id)
908 video_url = self._search_regex(self.VIDEO_URL_RE,
909 webpage, u'video URL')
# flv_url is percent-encoded in the page source.
910 video_url = compat_urllib_parse.unquote(video_url)
912 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
915 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
916 webpage, u'thumbnail', fatal=False)
923 'title': video_title,
925 'thumbnail': video_thumbnail,
# Extractor for nba.com videos: the MP4 URL is built directly from the URL
# path against Turner's CDN; title/description come from meta tags.
931 class NBAIE(InfoExtractor):
932 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
935 def _real_extract(self, url):
936 mobj = re.match(self._VALID_URL, url)
938 raise ExtractorError(u'Invalid URL: %s' % url)
# video_id is the slash-prefixed path captured by group 1.
940 video_id = mobj.group(1)
942 webpage = self._download_webpage(url, video_id)
# CDN URL is derived, not scraped: path + fixed 1280x720 MP4 suffix.
944 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
946 shortened_video_id = video_id.rpartition('/')[2]
947 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
948 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
950 # It isn't there in the HTML it returns to us
951 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
953 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
956 'id': shortened_video_id,
960 # 'uploader_date': uploader_date,
961 'description': description,
# Extractor for justin.tv / twitch.tv. Three URL shapes: a channel (paged
# archive listing via the JSON API), a single broadcast (/b/), and a chapter
# (/c/, resolved through both an XML broadcast API and the kraken JSON API).
965 class JustinTVIE(InfoExtractor):
966 """Information extractor for justin.tv and twitch.tv"""
967 # TODO: One broadcast may be split into multiple videos. The key
968 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
969 # starts at 1 and increases. Can we treat all parts as one video?
971 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
973 (?P<channelid>[^/]+)|
974 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
975 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size for the channel-archive API pagination below.
979 _JUSTIN_PAGE_LIMIT = 100
980 IE_NAME = u'justin.tv'
982 def report_download_page(self, channel, offset):
983 """Report attempt to download a single page of videos."""
984 self.to_screen(u'%s: Downloading video information from %d to %d' %
985 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
987 # Return count of items, list of *valid* items
988 def _parse_page(self, url, video_id):
989 webpage = self._download_webpage(url, video_id,
990 u'Downloading video info JSON',
991 u'unable to download video info JSON')
993 response = json.loads(webpage)
# API errors come back as a dict instead of the expected list of clips.
994 if type(response) != list:
995 error_text = response.get('error', 'unknown error')
996 raise ExtractorError(u'Justin.tv API: %s' % error_text)
998 for clip in response:
999 video_url = clip['video_file_url']
1001 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> upload date 'YYYYMMDD'.
1002 video_date = re.sub('-', '', clip['start_time'][:10])
1003 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1004 video_id = clip['id']
1005 video_title = clip.get('title', video_id)
1009 'title': video_title,
1010 'uploader': clip.get('channel_name', video_uploader_id),
1011 'uploader_id': video_uploader_id,
1012 'upload_date': video_date,
1013 'ext': video_extension,
1015 return (len(response), info)
1017 def _real_extract(self, url):
1018 mobj = re.match(self._VALID_URL, url)
1020 raise ExtractorError(u'invalid URL: %s' % url)
1022 api_base = 'http://api.justin.tv'
1024 if mobj.group('channelid'):
1026 video_id = mobj.group('channelid')
1027 api = api_base + '/channel/archives/%s.json' % video_id
1028 elif mobj.group('chapterid'):
1029 chapter_id = mobj.group('chapterid')
1031 webpage = self._download_webpage(url, chapter_id)
# The chapter page embeds the id of its parent broadcast archive.
1032 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1034 raise ExtractorError(u'Cannot find archive of a chapter')
1035 archive_id = m.group(1)
1037 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1038 chapter_info_xml = self._download_webpage(api, chapter_id,
1039 note=u'Downloading chapter information',
1040 errnote=u'Chapter information download failed')
1041 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the broadcast found on the page.
1042 for a in doc.findall('.//archive'):
1043 if archive_id == a.find('./id').text:
1046 raise ExtractorError(u'Could not find chapter in chapter information')
1048 video_url = a.find('./video_file_url').text
1049 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/thumbnail/etc. come from the newer kraken API.
1051 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1052 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1053 note='Downloading chapter metadata',
1054 errnote='Download of chapter metadata failed')
1055 chapter_info = json.loads(chapter_info_json)
1057 bracket_start = int(doc.find('.//bracket_start').text)
1058 bracket_end = int(doc.find('.//bracket_end').text)
1060 # TODO determine start (and probably fix up file)
1061 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1062 #video_url += u'?start=' + TODO:start_timestamp
1063 # bracket_start is 13290, but we want 51670615
1064 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1065 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1068 'id': u'c' + chapter_id,
1071 'title': chapter_info['title'],
1072 'thumbnail': chapter_info['preview'],
1073 'description': chapter_info['description'],
1074 'uploader': chapter_info['channel']['display_name'],
1075 'uploader_id': chapter_info['channel']['name'],
1079 video_id = mobj.group('videoid')
1080 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1082 self.report_extraction(video_id)
1086 limit = self._JUSTIN_PAGE_LIMIT
# Paged fetch loop: a short page (< limit items) signals the last page.
1089 self.report_download_page(video_id, offset)
1090 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1091 page_count, page_info = self._parse_page(page_url, video_id)
1092 info.extend(page_info)
1093 if not paged or page_count != limit:
# Extractor for funnyordie.com: everything is scraped from the page —
# the second <source> inside <video>, an h1 or <title> fallback for the
# title, and og:description for the description.
1098 class FunnyOrDieIE(InfoExtractor):
1099 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
1101 def _real_extract(self, url):
1102 mobj = re.match(self._VALID_URL, url)
1104 raise ExtractorError(u'invalid URL: %s' % url)
1106 video_id = mobj.group('id')
1107 webpage = self._download_webpage(url, video_id)
# The pattern deliberately skips the first <source> and captures the second.
1109 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1110 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: tried in order, first match wins.
1112 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1113 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
1115 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1116 webpage, u'description', fatal=False, flags=re.DOTALL)
1123 'description': video_description,
# Steam store extractor: returns a playlist of all movie trailers found on a
# game/video page, handling the age-gate interstitial by re-requesting with a
# fixed birth date.
# NOTE(review): original lines are elided in this dump; e.g. m.group('gameID')
# below relies on a named group defined in an elided part of _VALID_URL.
1127 class SteamIE(InfoExtractor):
1128 _VALID_URL = r"""http://store\.steampowered\.com/
1130 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1132 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1134 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
1135 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
1138 def suitable(cls, url):
1139 """Receives a URL and returns True if suitable for this IE."""
1140 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1142 def _real_extract(self, url):
1143 m = re.match(self._VALID_URL, url, re.VERBOSE)
1144 gameID = m.group('gameID')
1146 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1147 webpage = self._download_webpage(videourl, gameID)
# Age-gated pages are refetched through the agecheck URL with a canned DOB.
1149 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1150 videourl = self._AGECHECK_TEMPLATE % gameID
1151 self.report_age_confirmation()
1152 webpage = self._download_webpage(videourl, gameID)
1154 self.report_extraction(gameID)
1155 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1156 webpage, 'game title')
# Movies, their display names and thumbnails are scraped separately and
# zipped together; zip() stops at the shortest sequence.
1158 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1159 mweb = re.finditer(urlRE, webpage)
1160 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1161 titles = re.finditer(namesRE, webpage)
1162 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1163 thumbs = re.finditer(thumbsRE, webpage)
1165 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1166 video_id = vid.group('videoID')
1167 title = vtitle.group('videoName')
1168 video_url = vid.group('videoURL')
1169 video_thumb = thumb.group('thumbnail')
# raise is guarded by an elided 'if not video_url:'-style check above
1171 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1176 'title': unescapeHTML(title),
1177 'thumbnail': video_thumb
1180 return [self.playlist_result(videos, gameID, game_title)]
# Ustream recorded-video extractor: the media URL is derived directly from the
# numeric video id (tcdn CDN); title/uploader/thumbnail are scraped from HTML.
# NOTE(review): some original lines are elided in this dump.
1182 class UstreamIE(InfoExtractor):
1183 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1184 IE_NAME = u'ustream'
1186 def _real_extract(self, url):
1187 m = re.match(self._VALID_URL, url)
1188 video_id = m.group('videoID')
1190 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1191 webpage = self._download_webpage(url, video_id)
1193 self.report_extraction(video_id)
1195 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
1198 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1199 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1201 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1202 webpage, u'thumbnail', fatal=False)
1208 'title': video_title,
1209 'uploader': uploader,
1210 'thumbnail': thumbnail,
# WorldStarHipHop extractor (also handles the "candy" sister site).
# NOTE(review): lines are elided in this dump; e.g. the branch body after the
# 'mp4' check is missing.
1214 class WorldStarHipHopIE(InfoExtractor):
1215 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1216 IE_NAME = u'WorldStarHipHop'
1218 def _real_extract(self, url):
1219 m = re.match(self._VALID_URL, url)
1220 video_id = m.group('id')
1222 webpage_src = self._download_webpage(url, video_id)
# Direct file URL comes from the flash player's addVariable("file", ...) call.
1224 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1225 webpage_src, u'video URL')
1227 if 'mp4' in video_url:
1232 video_title = self._html_search_regex(r"<title>(.*)</title>",
1233 webpage_src, u'title')
1235 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1236 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1237 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a candytitles span; prefer it if present.
1240 _title = r"""candytitles.*>(.*)</span>"""
1241 mobj = re.search(_title, webpage_src)
1242 if mobj is not None:
1243 video_title = mobj.group(1)
1248 'title' : video_title,
1249 'thumbnail' : thumbnail,
# Red Bull Music Academy Radio extractor: show metadata is embedded as a JSON
# blob assigned to window.gon; the stream URL is the Akamai URL plus a fixed
# 256k bitrate query parameter.
# NOTE(review): some original lines are elided in this dump.
1254 class RBMARadioIE(InfoExtractor):
1255 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1257 def _real_extract(self, url):
1258 m = re.match(self._VALID_URL, url)
1259 video_id = m.group('videoID')
1261 webpage = self._download_webpage(url, video_id)
1263 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1264 webpage, u'json data', flags=re.MULTILINE)
1267 data = json.loads(json_data)
1268 except ValueError as e:
1269 raise ExtractorError(u'Invalid JSON: ' + str(e))
1271 video_url = data['akamai_url'] + '&cbr=256'
1272 url_parts = compat_urllib_parse_urlparse(video_url)
# File extension is taken from the URL path; everything after the last dot.
1273 video_ext = url_parts.path.rpartition('.')[2]
1278 'title': data['title'],
# Optional fields use .get() so missing keys degrade to None, not KeyError.
1279 'description': data.get('teaser_text'),
1280 'location': data.get('country_of_origin'),
1281 'uploader': data.get('host', {}).get('name'),
1282 'uploader_id': data.get('host', {}).get('slug'),
1283 'thumbnail': data.get('image', {}).get('large_url_2x'),
1284 'duration': data.get('duration'),
1289 class YouPornIE(InfoExtractor):
1290 """Information extractor for youporn.com."""
1291 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1293 def _print_formats(self, formats):
1294 """Print all available formats"""
1295 print(u'Available formats:')
1296 print(u'ext\t\tformat')
1297 print(u'---------------------------------')
1298 for format in formats:
1299 print(u'%s\t\t%s' % (format['ext'], format['format']))
1301 def _specific(self, req_format, formats):
1303 if(x["format"]==req_format):
1307 def _real_extract(self, url):
1308 mobj = re.match(self._VALID_URL, url)
1310 raise ExtractorError(u'Invalid URL: %s' % url)
1311 video_id = mobj.group('videoid')
1313 req = compat_urllib_request.Request(url)
1314 req.add_header('Cookie', 'age_verified=1')
1315 webpage = self._download_webpage(req, video_id)
1317 # Get JSON parameters
1318 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1320 params = json.loads(json_params)
1322 raise ExtractorError(u'Invalid JSON')
1324 self.report_extraction(video_id)
1326 video_title = params['title']
1327 upload_date = unified_strdate(params['release_date_f'])
1328 video_description = params['description']
1329 video_uploader = params['submitted_by']
1330 thumbnail = params['thumbnails'][0]['image']
1332 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1334 # Get all of the formats available
1335 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1336 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1337 webpage, u'download list').strip()
1339 # Get all of the links from the page
1340 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1341 links = re.findall(LINK_RE, download_list_html)
1342 if(len(links) == 0):
1343 raise ExtractorError(u'ERROR: no known formats available for video')
1345 self.to_screen(u'Links found: %d' % len(links))
1350 # A link looks like this:
1351 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1352 # A path looks like this:
1353 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1354 video_url = unescapeHTML( link )
1355 path = compat_urllib_parse_urlparse( video_url ).path
1356 extension = os.path.splitext( path )[1][1:]
1357 format = path.split('/')[4].split('_')[:2]
1360 format = "-".join( format )
1361 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1366 'uploader': video_uploader,
1367 'upload_date': upload_date,
1368 'title': video_title,
1371 'thumbnail': thumbnail,
1372 'description': video_description
1375 if self._downloader.params.get('listformats', None):
1376 self._print_formats(formats)
1379 req_format = self._downloader.params.get('format', None)
1380 self.to_screen(u'Format: %s' % req_format)
1382 if req_format is None or req_format == 'best':
1384 elif req_format == 'worst':
1385 return [formats[-1]]
1386 elif req_format in ('-1', 'all'):
1389 format = self._specific( req_format, formats )
1391 raise ExtractorError(u'Requested format not available')
# Pornotube extractor: flv URL comes from the player config; upload date is
# scraped from the "Added ... by" line and normalized with unified_strdate.
# NOTE(review): some original lines are elided in this dump.
1396 class PornotubeIE(InfoExtractor):
1397 """Information extractor for pornotube.com."""
1398 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1400 def _real_extract(self, url):
1401 mobj = re.match(self._VALID_URL, url)
1403 raise ExtractorError(u'Invalid URL: %s' % url)
1405 video_id = mobj.group('videoid')
1406 video_title = mobj.group('title')
1408 # Get webpage content
1409 webpage = self._download_webpage(url, video_id)
1412 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1413 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
# The player config stores the URL percent-encoded.
1414 video_url = compat_urllib_parse.unquote(video_url)
1416 #Get the uploaded date
1417 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1418 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1419 if upload_date: upload_date = unified_strdate(upload_date)
1421 info = {'id': video_id,
1424 'upload_date': upload_date,
1425 'title': video_title,
# YouJizz extractor: resolves the embed page first, then pulls the file URL
# from the embedded player's addVariable("file", ...) call.
# NOTE(review): some original lines are elided in this dump.
1431 class YouJizzIE(InfoExtractor):
1432 """Information extractor for youjizz.com."""
1433 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1435 def _real_extract(self, url):
1436 mobj = re.match(self._VALID_URL, url)
1438 raise ExtractorError(u'Invalid URL: %s' % url)
1440 video_id = mobj.group('videoid')
1442 # Get webpage content
1443 webpage = self._download_webpage(url, video_id)
1445 # Get the video title
1446 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1447 webpage, u'title').strip()
1449 # Get the embed page
1450 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1452 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug id from the watch URL.
1454 embed_page_url = result.group(0).strip()
1455 video_id = result.group('videoid')
1457 webpage = self._download_webpage(embed_page_url, video_id)
1460 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1461 webpage, u'video URL')
1463 info = {'id': video_id,
1465 'title': video_title,
1468 'player_url': embed_page_url}
# 8tracks extractor: a mix is a playlist; tracks are fetched one-by-one via
# the play/next JSON API using a random session id, until at_last_track.
# NOTE(review): lines are elided in this dump; `mix_id` below is presumably
# bound from `data` in an elided line — verify against the full source.
1472 class EightTracksIE(InfoExtractor):
1474 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1476 def _real_extract(self, url):
1477 mobj = re.match(self._VALID_URL, url)
1479 raise ExtractorError(u'Invalid URL: %s' % url)
1480 playlist_id = mobj.group('id')
1482 webpage = self._download_webpage(url, playlist_id)
1484 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1485 data = json.loads(json_like)
# Random session id mimics the web player's per-session token.
1487 session = str(random.randint(0, 1000000000))
1489 track_count = data['tracks_count']
1490 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1491 next_url = first_url
1493 for i in itertools.count():
1494 api_json = self._download_webpage(next_url, playlist_id,
1495 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1496 errnote=u'Failed to download song information')
1497 api_data = json.loads(api_json)
1498 track_data = api_data[u'set']['track']
1500 'id': track_data['id'],
1501 'url': track_data['track_file_stream_url'],
1502 'title': track_data['performer'] + u' - ' + track_data['name'],
1503 'raw_title': track_data['name'],
1504 'uploader_id': data['user']['login'],
# Stop once the API flags the last track; otherwise request the next one.
1508 if api_data['set']['at_last_track']:
1510 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Keek extractor: media and thumbnail URLs are derived directly from the id
# via the cdn.keek.com templates; title/uploader are scraped from the page.
# NOTE(review): some original lines are elided in this dump.
1513 class KeekIE(InfoExtractor):
1514 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1517 def _real_extract(self, url):
1518 m = re.match(self._VALID_URL, url)
1519 video_id = m.group('videoID')
1521 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1522 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1523 webpage = self._download_webpage(url, video_id)
1525 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1528 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1529 webpage, u'uploader', fatal=False)
1535 'title': video_title,
1536 'thumbnail': thumbnail,
1537 'uploader': uploader
# TED extractor: handles both single talks and playlists. Playlists are
# resolved into url_result entries pointing back at this IE; a talk's stream
# comes from the talkDetails JSON (last/highest-quality htmlStreams entry).
# NOTE(review): some original lines are elided in this dump.
1541 class TEDIE(InfoExtractor):
1542 _VALID_URL=r'''http://www\.ted\.com/
1544 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1546 ((?P<type_talk>talks)) # We have a simple talk
1548 (/lang/(.*?))? # The url may contain the language
1549 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
1553 def suitable(cls, url):
1554 """Receives a URL and returns True if suitable for this IE."""
1555 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1557 def _real_extract(self, url):
1558 m=re.match(self._VALID_URL, url, re.VERBOSE)
1559 if m.group('type_talk'):
1560 return [self._talk_info(url)]
1562 playlist_id=m.group('playlist_id')
1563 name=m.group('name')
1564 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1565 return [self._playlist_videos_info(url,name,playlist_id)]
1567 def _playlist_videos_info(self,url,name,playlist_id=0):
1568 '''Returns the videos of the playlist'''
1570 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1571 ([.\s]*?)data-playlist_item_id="(\d+)"
1572 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1574 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1575 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1576 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1577 m_names=re.finditer(video_name_RE,webpage)
1579 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1580 webpage, 'playlist title')
# Each playlist entry is delegated back to this extractor via url_result.
1582 playlist_entries = []
1583 for m_video, m_name in zip(m_videos,m_names):
1584 video_id=m_video.group('video_id')
1585 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1586 playlist_entries.append(self.url_result(talk_url, 'TED'))
1587 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1589 def _talk_info(self, url, video_id=0):
1590 """Return the video for the talk in the url"""
1591 m = re.match(self._VALID_URL, url,re.VERBOSE)
1592 video_name = m.group('name')
1593 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1594 self.report_extraction(video_name)
1595 # If the url includes the language we get the title translated
1596 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1598 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1599 webpage, 'json data')
1600 info = json.loads(json_data)
1601 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1602 webpage, 'description', flags = re.DOTALL)
1604 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1605 webpage, 'thumbnail')
# htmlStreams[-1]: the last stream entry is used (presumably highest quality
# — TODO confirm against talkDetails schema).
1608 'url': info['htmlStreams'][-1]['file'],
1611 'thumbnail': thumbnail,
1612 'description': desc,
# MySpass extractor: video id is the last (or second-to-last, with trailing
# slash) URL path element; metadata comes from an XML API endpoint.
# NOTE(review): some original lines are elided in this dump.
1616 class MySpassIE(InfoExtractor):
1617 _VALID_URL = r'http://www.myspass.de/.*'
1619 def _real_extract(self, url):
1620 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1622 # video id is the last path element of the URL
1623 # usually there is a trailing slash, so also try the second but last
1624 url_path = compat_urllib_parse_urlparse(url).path
1625 url_parent_path, video_id = os.path.split(url_path)
1627 _, video_id = os.path.split(url_parent_path)
1630 metadata_url = META_DATA_URL_TEMPLATE % video_id
1631 metadata_text = self._download_webpage(metadata_url, video_id)
1632 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1634 # extract values from metadata
1635 url_flv_el = metadata.find('url_flv')
1636 if url_flv_el is None:
1637 raise ExtractorError(u'Unable to extract download url')
1638 video_url = url_flv_el.text
1639 extension = os.path.splitext(video_url)[1][1:]
1640 title_el = metadata.find('title')
1641 if title_el is None:
1642 raise ExtractorError(u'Unable to extract title')
1643 title = title_el.text
1644 format_id_el = metadata.find('format_id')
1645 if format_id_el is None:
1648 format = format_id_el.text
# description and thumbnail are optional XML elements.
1649 description_el = metadata.find('description')
1650 if description_el is not None:
1651 description = description_el.text
1654 imagePreview_el = metadata.find('imagePreview')
1655 if imagePreview_el is not None:
1656 thumbnail = imagePreview_el.text
1665 'thumbnail': thumbnail,
1666 'description': description
# Spiegel extractor: per-video XML lists the available variants; the last
# element of the document is used for filename/duration.
# NOTE(review): some original lines are elided in this dump.
1670 class SpiegelIE(InfoExtractor):
1671 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1673 def _real_extract(self, url):
1674 m = re.match(self._VALID_URL, url)
1675 video_id = m.group('videoID')
1677 webpage = self._download_webpage(url, video_id)
1679 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1682 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1683 xml_code = self._download_webpage(xml_url, video_id,
1684 note=u'Downloading XML', errnote=u'Failed to download XML')
1686 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: the last child element is used (presumably the best variant —
# TODO confirm against the XML schema).
1687 last_type = idoc[-1]
1688 filename = last_type.findall('./filename')[0].text
1689 duration = float(last_type.findall('./duration')[0].text)
1691 video_url = 'http://video2.spiegel.de/flash/' + filename
1692 video_ext = filename.rpartition('.')[2]
1697 'title': video_title,
1698 'duration': duration,
# LiveLeak extractor: file URL from the player config, metadata from og: tags;
# the site prefix "LiveLeak.com -" is stripped from the title.
# NOTE(review): some original lines are elided in this dump.
1702 class LiveLeakIE(InfoExtractor):
1704 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1705 IE_NAME = u'liveleak'
1707 def _real_extract(self, url):
1708 mobj = re.match(self._VALID_URL, url)
1710 raise ExtractorError(u'Invalid URL: %s' % url)
1712 video_id = mobj.group('video_id')
1714 webpage = self._download_webpage(url, video_id)
1716 video_url = self._search_regex(r'file: "(.*?)",',
1717 webpage, u'video URL')
1719 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1720 webpage, u'title').replace('LiveLeak.com -', '').strip()
1722 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1723 webpage, u'description', fatal=False)
1725 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1726 webpage, u'uploader', fatal=False)
1732 'title': video_title,
1733 'description': video_description,
1734 'uploader': video_uploader
# Tumblr extractor: the video URL is embedded escaped (\x22-quoted) inside an
# inline script, hence the \\x22 markers in the regexes below.
# NOTE(review): some original lines are elided in this dump.
1741 class TumblrIE(InfoExtractor):
1742 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1744 def _real_extract(self, url):
1745 m_url = re.match(self._VALID_URL, url)
1746 video_id = m_url.group('id')
1747 blog = m_url.group('blog_name')
# Normalize to the canonical /post/ URL before downloading.
1749 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1750 webpage = self._download_webpage(url, video_id)
1752 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1753 video = re.search(re_video, webpage)
1755 raise ExtractorError(u'Unable to extract video')
1756 video_url = video.group('video_url')
1757 ext = video.group('ext')
1759 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1760 webpage, u'thumbnail', fatal=False) # We pick the first poster
# The thumbnail URL is backslash-escaped in the script; unescape it.
1761 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1763 # The only place where you can get a title, it's not complete,
1764 # but searching in other places doesn't work for all videos
1765 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1766 webpage, u'title', flags=re.DOTALL)
1768 return [{'id': video_id,
1770 'title': video_title,
1771 'thumbnail': video_thumbnail,
# Bandcamp extractor: only works for tracks with a free-download page. It
# rebuilds the statdownload URL (with a hard-coded .rand value) and reads the
# final track URL from the "retry_url" field of the response.
# NOTE(review): some original lines are elided in this dump.
1775 class BandcampIE(InfoExtractor):
1776 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1778 def _real_extract(self, url):
1779 mobj = re.match(self._VALID_URL, url)
1780 title = mobj.group('title')
1781 webpage = self._download_webpage(url, title)
1782 # We get the link to the free download page
1783 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1784 if m_download is None:
1785 raise ExtractorError(u'No free songs found')
1787 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin; kept as-is for byte-identity.
1788 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1789 webpage, re.MULTILINE|re.DOTALL).group('id')
1791 download_webpage = self._download_webpage(download_link, id,
1792 'Downloading free downloads page')
1793 # We get the dictionary of the track from some javascrip code
1794 info = re.search(r'items: (.*?),$',
1795 download_webpage, re.MULTILINE).group(1)
1796 info = json.loads(info)[0]
1797 # We pick mp3-320 for now, until format selection can be easily implemented.
1798 mp3_info = info[u'downloads'][u'mp3-320']
1799 # If we try to use this url it says the link has expired
1800 initial_url = mp3_info[u'url']
1801 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1802 m_url = re.match(re_url, initial_url)
1803 #We build the url we will use to get the final track url
1804 # This url is build in Bandcamp in the script download_bunde_*.js
1805 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1806 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1807 # If we could correctly generate the .rand field the url would be
1808 #in the "download_url" key
1809 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1811 track_info = {'id':id,
1812 'title' : info[u'title'],
1815 'thumbnail' : info[u'thumb_url'],
1816 'uploader' : info[u'artist']
# RedTube extractor: direct mp4 URL from the <source> tag; title from the
# page heading.
# NOTE(review): some original lines are elided in this dump.
1821 class RedTubeIE(InfoExtractor):
1822 """Information Extractor for redtube"""
1823 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1825 def _real_extract(self,url):
1826 mobj = re.match(self._VALID_URL, url)
1828 raise ExtractorError(u'Invalid URL: %s' % url)
1830 video_id = mobj.group('id')
1831 video_extension = 'mp4'
1832 webpage = self._download_webpage(url, video_id)
1834 self.report_extraction(video_id)
1836 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1837 webpage, u'video URL')
1839 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1845 'ext': video_extension,
1846 'title': video_title,
# INA (ina.fr) extractor: metadata comes from the player's MRSS feed; the mp4
# URL is in the <media:player> element and the title in a CDATA <title>.
# NOTE(review): some original lines are elided in this dump.
1849 class InaIE(InfoExtractor):
1850 """Information Extractor for Ina.fr"""
1851 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1853 def _real_extract(self,url):
1854 mobj = re.match(self._VALID_URL, url)
1856 video_id = mobj.group('id')
1857 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1858 video_extension = 'mp4'
1859 webpage = self._download_webpage(mrss_url, video_id)
1861 self.report_extraction(video_id)
1863 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1864 webpage, u'video URL')
1866 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1872 'ext': video_extension,
1873 'title': video_title,
# Howcast extractor: canonicalizes the URL from the numeric id, then pulls the
# mobile mp4 URL from the player config and metadata from meta tags.
# NOTE(review): some original lines are elided in this dump.
1876 class HowcastIE(InfoExtractor):
1877 """Information Extractor for Howcast.com"""
1878 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1880 def _real_extract(self, url):
1881 mobj = re.match(self._VALID_URL, url)
1883 video_id = mobj.group('id')
1884 webpage_url = 'http://www.howcast.com/videos/' + video_id
1885 webpage = self._download_webpage(webpage_url, video_id)
1887 self.report_extraction(video_id)
1889 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1890 webpage, u'video URL')
# Meta tags may use either single or double quotes, hence the alternation.
1892 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1895 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1896 webpage, u'description', fatal=False)
1898 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1899 webpage, u'thumbnail', fatal=False)
1905 'title': video_title,
1906 'description': video_description,
1907 'thumbnail': thumbnail,
# Vine extractor: stream URL from the twitter:player:stream meta tag, the rest
# from og: tags and the user block.
# NOTE(review): some original lines are elided in this dump.
1910 class VineIE(InfoExtractor):
1911 """Information Extractor for Vine.co"""
1912 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1914 def _real_extract(self, url):
1915 mobj = re.match(self._VALID_URL, url)
1917 video_id = mobj.group('id')
1918 webpage_url = 'https://vine.co/v/' + video_id
1919 webpage = self._download_webpage(webpage_url, video_id)
1921 self.report_extraction(video_id)
1923 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1924 webpage, u'video URL')
1926 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Optional query string after the image URL is matched but discarded.
1929 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1930 webpage, u'thumbnail', fatal=False)
1932 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1933 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1939 'title': video_title,
1940 'thumbnail': thumbnail,
1941 'uploader': uploader,
# Flickr video extractor: two-step XML API — first fetch the node id using the
# page's photo_secret, then the playlist XML whose <STREAM> element yields the
# final URL (APP + FULLPATH).
# NOTE(review): some original lines are elided in this dump.
1944 class FlickrIE(InfoExtractor):
1945 """Information Extractor for Flickr videos"""
1946 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1948 def _real_extract(self, url):
1949 mobj = re.match(self._VALID_URL, url)
1951 video_id = mobj.group('id')
1952 video_uploader_id = mobj.group('uploader_id')
1953 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1954 webpage = self._download_webpage(webpage_url, video_id)
1956 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1958 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1959 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1961 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1962 first_xml, u'node_id')
1964 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1965 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1967 self.report_extraction(video_id)
1969 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1971 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
1972 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1974 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1975 webpage, u'video title')
1977 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1978 webpage, u'description', fatal=False)
1980 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1981 webpage, u'thumbnail', fatal=False)
1987 'title': video_title,
1988 'description': video_description,
1989 'thumbnail': thumbnail,
1990 'uploader_id': video_uploader_id,
# Team Coco extractor: numeric id is scraped from the article markup, then the
# cvp XML endpoint supplies the high-quality file URL.
# NOTE(review): some original lines are elided in this dump.
1993 class TeamcocoIE(InfoExtractor):
1994 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1996 def _real_extract(self, url):
1997 mobj = re.match(self._VALID_URL, url)
1999 raise ExtractorError(u'Invalid URL: %s' % url)
2000 url_title = mobj.group('url_title')
2001 webpage = self._download_webpage(url, url_title)
2003 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2004 webpage, u'video id')
2006 self.report_extraction(video_id)
2008 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2011 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2012 webpage, u'thumbnail', fatal=False)
2014 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2015 webpage, u'description', fatal=False)
2017 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2018 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2020 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2027 'title': video_title,
2028 'thumbnail': thumbnail,
2029 'description': video_description,
# xHamster extractor: the player config carries 'srv' and 'file'; when srv is
# empty, file is already a full (percent-encoded) URL, otherwise the two are
# joined with a /key= path.
# NOTE(review): some original lines are elided in this dump.
2032 class XHamsterIE(InfoExtractor):
2033 """Information Extractor for xHamster"""
2034 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2036 def _real_extract(self,url):
2037 mobj = re.match(self._VALID_URL, url)
2039 video_id = mobj.group('id')
2040 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2041 webpage = self._download_webpage(mrss_url, video_id)
2043 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2045 raise ExtractorError(u'Unable to extract media URL')
2046 if len(mobj.group('server')) == 0:
2047 video_url = compat_urllib_parse.unquote(mobj.group('file'))
2049 video_url = mobj.group('server')+'/key='+mobj.group('file')
2050 video_extension = video_url.split('.')[-1]
2052 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2055 # Can't see the description anywhere in the UI
2056 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2057 # webpage, u'description', fatal=False)
2058 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is reconstructed as YYYYMMDD from the tooltip timestamp.
2060 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2062 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2064 video_upload_date = None
2065 self._downloader.report_warning(u'Unable to extract upload date')
2067 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2068 webpage, u'uploader id', default=u'anonymous')
2070 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2071 webpage, u'thumbnail', fatal=False)
2076 'ext': video_extension,
2077 'title': video_title,
2078 # 'description': video_description,
2079 'upload_date': video_upload_date,
2080 'uploader_id': video_uploader_id,
2081 'thumbnail': video_thumbnail
# Hype Machine extractor: fetches the page with a timestamped query to obtain
# a session cookie, reads the displayList-data JSON for the track, then calls
# the /serve/source endpoint (with that cookie) for the final URL.
# NOTE(review): lines are elided in this dump; e.g. `key` below is presumably
# bound from the track dict in an elided line — verify against full source.
2084 class HypemIE(InfoExtractor):
2085 """Information extractor for hypem"""
2086 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2088 def _real_extract(self, url):
2089 mobj = re.match(self._VALID_URL, url)
2091 raise ExtractorError(u'Invalid URL: %s' % url)
2092 track_id = mobj.group(1)
2094 data = { 'ax': 1, 'ts': time.time() }
2095 data_encoded = compat_urllib_parse.urlencode(data)
2096 complete_url = url + "?" + data_encoded
2097 request = compat_urllib_request.Request(complete_url)
2098 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# Session cookie must be replayed on the serve request below.
2099 cookie = urlh.headers.get('Set-Cookie', '')
2101 self.report_extraction(track_id)
2103 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2104 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
2106 track_list = json.loads(html_tracks)
2107 track = track_list[u'tracks'][0]
2109 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2112 track_id = track[u"id"]
2113 artist = track[u"artist"]
2114 title = track[u"song"]
2116 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2117 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2118 request.add_header('cookie', cookie)
2119 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
2121 song_data = json.loads(song_data_json)
2123 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2124 final_url = song_data[u"url"]
# NOTE(review): lossy extract — embedded numbering gaps (2137, 2140, 2143,
# 2148, 2151-2152, 2161-2166, 2168-2170 absent) mean the `if mobj is None:`
# guard and nearly all of the final return dict are missing. Code is kept
# byte-identical; comments only describe what is visible.
2134 class Vbox7IE(InfoExtractor):
2135 """Information Extractor for Vbox7"""
2136 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2138 def _real_extract(self,url):
2139 mobj = re.match(self._VALID_URL, url)
# (The `if mobj is None:` guard preceding this raise is elided — 2140.)
2141 raise ExtractorError(u'Invalid URL: %s' % url)
2142 video_id = mobj.group(1)
# The play page is a JS redirect; scrape the target out of window.location
# and resolve it against the URL we actually landed on.
2144 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2145 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2146 redirect_url = urlh.geturl() + new_location
2147 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the <title> text up to the first '/', whitespace-trimmed.
2149 title = self._html_search_regex(r'<title>(.*)</title>',
2150 webpage, u'title').split('/')[0].strip()
# POST as3=1&vid=<id> to the magare.do endpoint to get media info.
2153 info_url = "http://vbox7.com/play/magare.do"
2154 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2155 info_request = compat_urllib_request.Request(info_url, data)
2156 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2157 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2158 if info_response is None:
2159 raise ExtractorError(u'Unable to extract the media url')
# Response looks like key1=url&key2=thumb; split on '&' then take the value
# after each '='. Assumes exactly two '&'-separated pairs — TODO confirm.
2160 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# The surrounding return dict (original lines 2161-2166, 2168-2170) is
# elided from this extract; only the thumbnail entry survives.
2167 'thumbnail': thumbnail_url,
# NOTE(review): the body of this function (original lines 2174-2240) is
# almost entirely elided — only three entries of the instance list and
# neither the opening `return [` nor the closing `]` are visible. Kept
# byte-identical; do not treat this as the full extractor registry.
2171 def gen_extractors():
2172 """ Return a list of an instance of every supported extractor.
2173 The order does matter; the first extractor matched is the one handling the URL.
2176 YoutubePlaylistIE(),
2201 StanfordOpenClassroomIE(),
2211 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    Looks up ``<ie_name>IE`` in this module's globals, so e.g.
    ``get_info_extractor('Youtube')`` returns the ``YoutubeIE`` class
    (the class itself, not an instance).

    Raises:
        KeyError: if no extractor class named ``<ie_name>IE`` exists
            in this module.
    """
    # globals() lookup keeps this in sync with whatever *IE classes the
    # module defines, with no registry to maintain by hand.
    return globals()[ie_name + 'IE']