10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.dailymotion import DailymotionIE
25 from .extractor.gametrailers import GametrailersIE
26 from .extractor.generic import GenericIE
27 from .extractor.metacafe import MetacafeIE
28 from .extractor.statigram import StatigramIE
29 from .extractor.photobucket import PhotobucketIE
30 from .extractor.vimeo import VimeoIE
31 from .extractor.yahoo import YahooIE, YahooSearchIE
32 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
33 from .extractor.zdf import ZDFIE
# Extractor for depositfiles.com file pages. It re-requests the page in the
# English locale with the "free download" form field posted, then scrapes the
# real file URL and the file title out of the returned HTML.
53 class DepositFilesIE(InfoExtractor):
54 """Information extractor for depositfiles.com"""
# Scheme, subdomain and a two-character locale path segment are all optional;
# group 1 captures everything after "files/".
56 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Extract the download URL, title and extension for one file page.
# Raises ExtractorError when the page cannot be retrieved or when no
# download URL can be found in it.
58 def _real_extract(self, url):
# The file id is the last path component of the URL.
59 file_id = url.split('/')[-1]
60 # Rebuild url in english locale
61 url = 'http://depositfiles.com/en/files/' + file_id
63 # Retrieve file webpage with 'Free download' button pressed
64 free_download_indication = { 'gateway_result' : '1' }
# The url-encoded form is handed to Request as the request body.
65 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the `except` below has no visible `try:` in this excerpt;
# presumably it guards the two statements that follow - confirm in the full file.
67 self.report_download_webpage(file_id)
68 webpage = compat_urllib_request.urlopen(request).read()
69 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
70 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
72 # Search for the real file URL
73 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
74 if (mobj is None) or (mobj.group(1) is None):
75 # Try to figure out reason of the error.
76 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
77 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace runs so the site's message becomes a single line.
78 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
79 raise ExtractorError(u'%s' % restriction_message)
# NOTE(review): presumably the fallback when no "Attention" message was
# found; the branch keyword is not visible in this excerpt.
81 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
83 file_url = mobj.group(1)
# Extension of the download URL without its leading dot.
84 file_extension = os.path.splitext(file_url)[1][1:]
86 # Search for file title
87 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Entries of the result; the enclosing return statement and the remaining
# keys are not visible in this excerpt.
# NOTE(review): .decode('utf-8') assumes these values are byte strings -
# confirm behaviour on Python 3, where str has no decode().
90 'id': file_id.decode('utf-8'),
91 'url': file_url.decode('utf-8'),
95 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook video pages. It optionally logs in first (credentials
# come from the downloader params or from .netrc) and then reads the video
# URLs out of the JSON that the page passes to its SWF player.
99 class FacebookIE(InfoExtractor):
100 """Information Extractor for Facebook"""
# The named group ID is the numeric value of the "v" query parameter.
102 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
103 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name that _real_initialize() looks up in .netrc.
104 _NETRC_MACHINE = 'facebook'
105 IE_NAME = u'facebook'
107 def report_login(self):
108 """Report attempt to log in."""
109 self.to_screen(u'Logging in')
# Try to log in before any extraction happens. The handlers visible here
# surface problems through report_warning().
111 def _real_initialize(self):
# NOTE(review): the statement guarded by this check is not visible here.
112 if self._downloader is None:
117 downloader_params = self._downloader.params
119 # Attempt to use provided username and password or .netrc data
120 if downloader_params.get('username', None) is not None:
121 useremail = downloader_params['username']
122 password = downloader_params['password']
123 elif downloader_params.get('usenetrc', False):
# Look up the .netrc entry for the 'facebook' machine.
125 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# NOTE(review): presumably raised when no entry was found; the condition
# and the surrounding `try:` are not visible in this excerpt.
130 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
131 except (IOError, netrc.NetrcParseError) as err:
132 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# NOTE(review): presumably skips the login when no credentials are
# available; the guarded statement is not visible here.
135 if useremail is None:
# The login form is sent as the request body; login_form itself is built
# on lines that are not visible in this excerpt.
144 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
147 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response is treated as a failed login.
148 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
149 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
151 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
152 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Extract title, URL, duration and thumbnail for one video URL.
# Raises ExtractorError for an invalid URL, unparsable page data or a
# missing video URL.
155 def _real_extract(self, url):
156 mobj = re.match(self._VALID_URL, url)
158 raise ExtractorError(u'Invalid URL: %s' % url)
159 video_id = mobj.group('ID')
# Always fetch the canonical video page, whatever form the input URL had.
161 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
162 webpage = self._download_webpage(url, video_id)
# The player data is the text located between these two JavaScript snippets.
164 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
165 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
166 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
168 raise ExtractorError(u'Cannot parse data')
# The captured JSON is turned into a dict; its 'params' value is itself
# URL-quoted JSON.
169 data = dict(json.loads(m.group(1)))
170 params_raw = compat_urllib_parse.unquote(data['params'])
171 params = json.loads(params_raw)
172 video_data = params['video_data'][0]
# Try the HD source first, then the SD source.
# NOTE(review): the conditions between these lines are not visible here.
173 video_url = video_data.get('hd_src')
175 video_url = video_data['sd_src']
177 raise ExtractorError(u'Cannot find video URL')
178 video_duration = int(video_data['video_duration'])
179 thumbnail = video_data['thumbnail_src']
# The remaining arguments of this call are not visible in this excerpt.
181 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Entries of the result; the other keys are not visible in this excerpt.
186 'title': video_title,
189 'duration': video_duration,
190 'thumbnail': thumbnail,
# Extractor for myvideo.de. A page either exposes a plain <source> tag, or it
# carries flashvars pointing at an encrypted player XML that has to be
# downloaded and decrypted before the stream URL can be read.
197 class MyVideoIE(InfoExtractor):
198 """Information Extractor for myvideo.de."""
# Group 1 is the numeric video id, group 2 the path segment after it.
200 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
203 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
204 # Released into the Public Domain by Tristan Fischer on 2013-05-19
205 # https://github.com/rg3/youtube-dl/pull/842
# Cipher helper used to decrypt the player XML. The visible lines set up a
# 256-entry table mixed with the key and then XOR each input character with
# a table value. The initialisation of x, y and out, the second loop header
# and the return are not visible in this excerpt.
206 def __rc4crypt(self,data, key):
208 box = list(range(256))
# First pass: shuffle the table using the key bytes.
209 for i in list(range(256)):
210 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
211 box[i], box[x] = box[x], box[i]
# Second pass (per input character): advance the table and XOR.
217 y = (y + box[x]) % 256
218 box[x], box[y] = box[y], box[x]
219 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Hex MD5 digest of s, encoded to bytes. The `def` line of the helper this
# statement belongs to is not visible in this excerpt.
223 return hashlib.md5(s).hexdigest().encode()
# Extract the stream information for one video URL.
# Raises ExtractorError for an invalid URL or when no video data is found.
225 def _real_extract(self,url):
226 mobj = re.match(self._VALID_URL, url)
228 raise ExtractorError(u'invalid URL: %s' % url)
230 video_id = mobj.group(1)
# Adjacent bytes literals forming one value.
# NOTE(review): presumably assigned to GK, which is base64-decoded twice
# further down; the assignment itself is not visible here.
233 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
234 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
235 b'TnpsbA0KTVRkbU1tSTRNdz09'
239 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
240 webpage = self._download_webpage(webpage_url, video_id)
# Simple case: the page contains a <source> tag. The URL without its last
# extension gets '.flv' appended.
242 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
244 self.report_extraction(video_id)
245 video_url = mobj.group(1) + '.flv'
247 video_title = self._html_search_regex('<title>([^<]+)</title>',
250 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
257 'title': video_title,
# Otherwise fall back to the flashvars block of the page.
262 mobj = re.search('var flashvars={(.+?)}', webpage)
264 raise ExtractorError(u'Unable to extract video')
# Walk the key:'value' pairs of the flashvars; `sec` and `params` are set
# up on lines not visible here. The '_encxml' entry is the XML endpoint.
269 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
270 if not a == '_encxml':
273 encxml = compat_urllib_parse.unquote(b)
# Supply a default domain parameter when the page did not provide one.
274 if not params.get('domain'):
275 params['domain'] = 'www.myvideo.de'
276 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# URLs that select the MTV player type are replaced by a fixed player XML
# request (the surrounding assignment is only partly visible here).
277 if 'flash_playertype=MTV' in xmldata_url:
278 self._downloader.report_warning(u'avoiding MTV player')
280 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
281 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is split on '=' and the second piece, a hex string, is
# converted to raw bytes.
285 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
286 enc_data_b = binascii.unhexlify(enc_data)
# Key material: the doubly base64-decoded GK value plus the video id.
# NOTE(review): how these are combined into `sk` is not visible here.
288 base64.b64decode(base64.b64decode(GK)) +
290 str(video_id).encode('utf-8')
293 dec_data = self.__rc4crypt(enc_data_b, sk)
296 self.report_extraction(video_id)
# Streaming case: the decrypted data names a connection URL.
299 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
301 video_url = compat_urllib_parse.unquote(mobj.group(1))
302 if 'myvideo2flash' in video_url:
303 self._downloader.report_warning(u'forcing RTMPT ...')
304 video_url = video_url.replace('rtmpe://', 'rtmpt://')
307 # extract non rtmp videos
308 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
310 raise ExtractorError(u'unable to extract url')
311 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
313 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
314 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files get a "<extension>:<name>" play path and no HLS playlist.
# NOTE(review): the two-way split assumes exactly one '.' in video_file.
316 if not video_file.endswith('f4m'):
317 ppath, prefix = video_file.split('.')
318 video_playpath = '%s:%s' % (prefix, ppath)
319 video_hls_playlist = ''
# f4m manifests: derive an .m3u8 playlist URL (video_filepath is assigned
# on a line not visible in this excerpt).
322 video_hls_playlist = (
323 video_filepath + video_file
324 ).replace('.f4m', '.m3u8')
# First argument of the page's swfobject.embedSWF('...') call.
326 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
327 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
329 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# Entries of the result; the other keys are not visible in this excerpt.
338 'title': video_title,
340 'play_path': video_playpath,
341 'video_file': video_file,
342 'video_hls_playlist': video_hls_playlist,
343 'player_url': video_swfobj,
# Extractor for thedailyshow.com / colbertnation.com episodes and clips. It
# resolves the page to an MTV Networks media id, reads the show index feed,
# and then reads a per-part configuration that lists the renditions.
347 class ComedyCentralIE(InfoExtractor):
348 """Information extractor for The Daily Show and Colbert Report """
# The pattern below spans several lines and is always matched with
# re.VERBOSE (see suitable() and _real_extract()).
350 # urls can be abbreviations like :thedailyshow or :colbert
351 # urls for episodes like:
352 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
353 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
354 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
355 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
356 |(https?://)?(www\.)?
357 (?P<showname>thedailyshow|colbertnation)\.com/
358 (full-episodes/(?P<episode>.*)|
360 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
361 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Format identifiers offered for selection.
364 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Lookup tables read by _print_formats(); their entries are not visible in
# this excerpt.
366 _video_extensions = {
374 _video_dimensions = {
# Overrides the URL check so the multi-line pattern is compiled with
# re.VERBOSE. Takes `cls`; any decorator is not visible in this excerpt.
384 def suitable(cls, url):
385 """Receives a URL and returns True if suitable for this IE."""
386 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print one line per format: id, extension (default 'mp4') and dimensions
# (default '???'). The loop header is not visible in this excerpt.
388 def _print_formats(self, formats):
389 print('Available formats:')
391 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Extract every part of an episode or clip.
# Raises ExtractorError for invalid or unspecific URLs, when no media
# reference is found in the page, and when an RTMP URL cannot be converted.
394 def _real_extract(self, url):
395 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
397 raise ExtractorError(u'Invalid URL: %s' % url)
# Short names such as ":tds" are expanded to the show's full-episodes page
# and then matched again.
399 if mobj.group('shortname'):
400 if mobj.group('shortname') in ('tds', 'thedailyshow'):
401 url = u'http://www.thedailyshow.com/full-episodes/'
403 url = u'http://www.colbertnation.com/full-episodes/'
404 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
405 assert mobj is not None
# Clips take their title from the show-specific group of the pattern.
# NOTE(review): no group named 'clip' appears in the visible pattern lines;
# presumably it is defined on the pattern line missing from this excerpt.
407 if mobj.group('clip'):
408 if mobj.group('showname') == 'thedailyshow':
409 epTitle = mobj.group('tdstitle')
411 epTitle = mobj.group('cntitle')
# For full episodes an empty episode part means "newest episode".
414 dlNewest = not mobj.group('episode')
416 epTitle = mobj.group('showname')
418 epTitle = mobj.group('episode')
420 self.report_extraction(epTitle)
421 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Re-read the URL after redirects and require it to name an episode.
# NOTE(review): presumably only done when dlNewest is set; the condition is
# not visible in this excerpt.
423 url = htmlHandle.geturl()
424 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
426 raise ExtractorError(u'Invalid redirected URL: ' + url)
427 if mobj.group('episode') == '':
428 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
429 epTitle = mobj.group('episode')
# Each match is a (full player URL, media reference) pair.
431 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
433 if len(mMovieParams) == 0:
434 # The Colbert Report embeds the information in a data-mgid attribute without
435 # a URL prefix; so extract the alternate reference
436 # and then add the URL prefix manually.
438 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
439 if len(altMovieParams) == 0:
440 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
442 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# The media reference of the first match selects the show index feed.
444 uri = mMovieParams[0][1]
445 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
446 indexXml = self._download_webpage(indexUrl, epTitle,
447 u'Downloading show index',
448 u'unable to download episode index')
# Every <item> of the feed is one part of the episode.
452 idoc = xml.etree.ElementTree.fromstring(indexXml)
453 itemEls = idoc.findall('.//item')
454 for partNum,itemEl in enumerate(itemEls):
# The guid is colon-separated: the last field is the short media id, the
# field before it names the show's site.
455 mediaId = itemEl.findall('./guid')[0].text
456 shortMediaId = mediaId.split(':')[-1]
457 showId = mediaId.split(':')[-2].replace('.com', '')
458 officialTitle = itemEl.findall('./title')[0].text
459 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
461 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
462 compat_urllib_parse.urlencode({'uri': mediaId}))
463 configXml = self._download_webpage(configUrl, epTitle,
464 u'Downloading configuration for %s' % shortMediaId)
466 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect a (bitrate, source URL) pair per rendition; the list they are
# gathered into (turls) is built on lines not visible in this excerpt.
468 for rendition in cdoc.findall('.//rendition'):
469 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
473 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
476 if self._downloader.params.get('listformats', None):
477 self._print_formats([i[0] for i in turls])
480 # For now, just pick the highest bitrate
481 format,rtmp_video_url = turls[-1]
483 # Get the format arg from the arg stream
484 req_format = self._downloader.params.get('format', None)
486 # Select format if we can find one
489 format, rtmp_video_url = f, v
# Turn the RTMP URL into an HTTP URL: keep the path from 'gsp.comedystor/'
# onwards and append it to a fixed base URL.
492 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
494 raise ExtractorError(u'Cannot transform RTMP url')
495 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
496 video_url = base + m.group('finalid')
# Part numbers in the title are 1-based.
498 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# Entries of the per-part result; the other keys are not visible here.
503 'upload_date': officialDate,
508 'description': officialTitle,
# Extractor for escapistmagazine.com videos. Page metadata comes from <meta>
# tags; the media URL comes from a configuration file referenced by the
# player URL.
515 class EscapistIE(InfoExtractor):
516 """Information extractor for The Escapist """
# Named groups: showname (used as uploader) and episode (used as video id).
518 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
519 IE_NAME = u'escapist'
# Extract one video.
# Raises ExtractorError for an invalid URL or an unparsable configuration.
521 def _real_extract(self, url):
522 mobj = re.match(self._VALID_URL, url)
524 raise ExtractorError(u'Invalid URL: %s' % url)
525 showName = mobj.group('showname')
526 videoId = mobj.group('episode')
528 self.report_extraction(videoId)
529 webpage = self._download_webpage(url, videoId)
# Description and thumbnail are looked up with fatal=False.
531 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
532 webpage, u'description', fatal=False)
534 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
535 webpage, u'thumbnail', fatal=False)
537 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
538 webpage, u'player url')
# The title is the text after the last ' : ' of the title meta tag.
# NOTE(review): the label passed here is u'player url' although the pattern
# targets the title tag; it looks copy-pasted from the call above.
540 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
541 webpage, u'player url').split(' : ')[-1]
# The configuration URL is the URL-quoted value after "config=" in the
# player URL.
543 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
544 configUrl = compat_urllib_parse.unquote(configUrl)
546 configJSON = self._download_webpage(configUrl, videoId,
547 u'Downloading configuration',
548 u'unable to download configuration')
550 # Technically, it's JavaScript, not JSON
# Swap single for double quotes so that json.loads() accepts the text.
551 configJSON = configJSON.replace("'", '"')
554 config = json.loads(configJSON)
555 except (ValueError,) as err:
556 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is read from the second playlist entry.
558 playlist = config['playlist']
559 videoUrl = playlist[1]['url']
# Entries of the result; the other keys are not visible in this excerpt.
564 'uploader': showName,
569 'description': videoDesc,
570 'player_url': playerUrl,
# Extractor for collegehumor.com videos. It reads a metadata XML for title,
# description and thumbnail, then an f4m manifest from which the first
# fragment URL is assembled.
575 class CollegeHumorIE(InfoExtractor):
576 """Information extractor for collegehumor.com"""
# Named groups: videoid (numeric) and shorttitle (rest of the path).
579 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
580 IE_NAME = u'collegehumor'
# Prints a progress line for the manifest download.
582 def report_manifest(self, video_id):
583 """Report information extraction."""
584 self.to_screen(u'%s: Downloading XML manifest' % video_id)
# Extract one video.
# Raises ExtractorError for an invalid URL, a failed download or XML that
# lacks the expected nodes.
586 def _real_extract(self, url):
587 mobj = re.match(self._VALID_URL, url)
589 raise ExtractorError(u'Invalid URL: %s' % url)
590 video_id = mobj.group('videoid')
598 self.report_extraction(video_id)
599 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
# NOTE(review): the `try:` lines paired with the `except` clauses in this
# method are not visible in this excerpt.
601 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
602 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
603 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
605 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# The first <video> node provides the metadata and the manifest location;
# `info` is created on lines not visible in this excerpt.
607 videoNode = mdoc.findall('./video')[0]
608 info['description'] = videoNode.findall('./description')[0].text
609 info['title'] = videoNode.findall('./caption')[0].text
610 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
611 manifest_url = videoNode.findall('./file')[0].text
613 raise ExtractorError(u'Invalid metadata XML file')
615 manifest_url += '?hdcore=2.10.3'
616 self.report_manifest(video_id)
618 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
619 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
620 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
622 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Nodes are addressed with the f4m 1.0 XML namespace. From here on
# video_id holds the manifest's <id> text, not the id from the URL.
624 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
625 node_id = media_node.attrib['url']
626 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
627 except IndexError as err:
628 raise ExtractorError(u'Invalid manifest file')
# First fragment URL: manifest host + '/z' + manifest id minus its last two
# characters + media url + 'Seg1-Frag1'.
630 url_pr = compat_urllib_parse_urlparse(manifest_url)
631 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: video URL, title and thumbnail are all scraped
# from the watch page with regular expressions.
638 class XVideosIE(InfoExtractor):
639 """Information extractor for xvideos.com"""
# Group 1 is the numeric video id.
641 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
# Extract one video. Raises ExtractorError for an invalid URL.
644 def _real_extract(self, url):
645 mobj = re.match(self._VALID_URL, url)
647 raise ExtractorError(u'Invalid URL: %s' % url)
648 video_id = mobj.group(1)
650 webpage = self._download_webpage(url, video_id)
652 self.report_extraction(video_id)
# The media URL is the URL-quoted value of "flv_url" in the page.
655 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
656 webpage, u'video URL'))
# Title is the <title> text before " - XVID"; the remaining arguments of
# this call are not visible in this excerpt.
659 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
662 # Extract video thumbnail
# NOTE(review): the only capture group is the trailing file name, so the
# lookup presumably yields just that part of the URL - confirm what
# _search_regex returns by default.
663 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
664 webpage, u'thumbnail', fatal=False)
# Entries of the result; the other keys are not visible in this excerpt.
671 'title': video_title,
673 'thumbnail': video_thumbnail,
# Extractor for single soundcloud.com tracks. The visible code resolves the
# page URL through the API's resolve.json endpoint and then asks a streams
# endpoint for the media URL.
680 class SoundcloudIE(InfoExtractor):
681 """Information extractor for soundcloud.com
682 To access the media, the uid of the song and a stream token
683 must be extracted from the page source and the script must make
684 a request to media.soundcloud.com/crossdomain.xml. Then
685 the media can be grabbed by requesting from an url composed
686 of the stream token and uid
# Group 1 is the uploader, group 2 the track slug.
689 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
690 IE_NAME = u'soundcloud'
# Prints a progress line for the id resolution step.
692 def report_resolve(self, video_id):
693 """Report information extraction."""
694 self.to_screen(u'%s: Resolving id' % video_id)
# Extract one track. Raises ExtractorError for an invalid URL.
696 def _real_extract(self, url):
697 mobj = re.match(self._VALID_URL, url)
699 raise ExtractorError(u'Invalid URL: %s' % url)
701 # extract uploader (which is in the url)
702 uploader = mobj.group(1)
703 # extract simple title (uploader + slug of song title)
704 slug_title = mobj.group(2)
705 simple_title = uploader + u'-' + slug_title
# "uploader/slug" is used as the label in progress output.
706 full_title = '%s/%s' % (uploader, slug_title)
708 self.report_resolve(full_title)
# Rebuild a normalized page URL and let the API resolve it to track data.
710 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
711 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
712 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
714 info = json.loads(info_json)
715 video_id = info['id']
716 self.report_extraction(full_title)
# A second request returns the stream definitions for the track id.
718 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
719 stream_json = self._download_webpage(streams_url, full_title,
720 u'Downloading stream definitions',
721 u'unable to download stream definitions')
# Only the 'http_mp3_128_url' stream is used.
723 streams = json.loads(stream_json)
724 mediaURL = streams['http_mp3_128_url']
725 upload_date = unified_strdate(info['created_at'])
# Entries of the result; the other keys are not visible in this excerpt.
730 'uploader': info['user']['username'],
731 'upload_date': upload_date,
732 'title': info['title'],
734 'description': info['description'],
# Extractor for soundcloud.com sets. The set URL is resolved through the
# API's resolve.json endpoint and every track of the set is then looked up
# individually at the streams endpoint.
737 class SoundcloudSetIE(InfoExtractor):
738 """Information extractor for soundcloud.com sets
739 To access the media, the uid of the song and a stream token
740 must be extracted from the page source and the script must make
741 a request to media.soundcloud.com/crossdomain.xml. Then
742 the media can be grabbed by requesting from an url composed
743 of the stream token and uid
# Group 1 is the uploader, group 2 the set slug.
746 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
747 IE_NAME = u'soundcloud:set'
# Prints a progress line for the id resolution step.
749 def report_resolve(self, video_id):
750 """Report information extraction."""
751 self.to_screen(u'%s: Resolving id' % video_id)
# Extract every track of one set. Raises ExtractorError for an invalid URL.
753 def _real_extract(self, url):
754 mobj = re.match(self._VALID_URL, url)
756 raise ExtractorError(u'Invalid URL: %s' % url)
758 # extract uploader (which is in the url)
759 uploader = mobj.group(1)
760 # extract simple title (uploader + slug of song title)
761 slug_title = mobj.group(2)
762 simple_title = uploader + u'-' + slug_title
# "uploader/sets/slug" is used as the label in progress output.
763 full_title = '%s/sets/%s' % (uploader, slug_title)
765 self.report_resolve(full_title)
# Rebuild a normalized set URL and let the API resolve it to set data.
767 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
768 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
769 info_json = self._download_webpage(resolv_url, full_title)
772 info = json.loads(info_json)
# Errors returned by the API are reported one by one.
# NOTE(review): presumably guarded by a check that 'errors' is present; the
# guard is not visible in this excerpt.
774 for err in info['errors']:
775 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
778 self.report_extraction(full_title)
# One streams request per track of the set.
779 for track in info['tracks']:
780 video_id = track['id']
782 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
783 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
785 self.report_extraction(video_id)
# Only the 'http_mp3_128_url' stream is used.
786 streams = json.loads(stream_json)
787 mediaURL = streams['http_mp3_128_url']
# Entries of the per-track result; the other keys are not visible here.
792 'uploader': track['user']['username'],
793 'upload_date': unified_strdate(track['created_at']),
794 'title': track['title'],
796 'description': track['description'],
# Extractor for infoq.com presentations. The stream path is stored
# base64-encoded in a JavaScript variable of the page.
801 class InfoQIE(InfoExtractor):
802 """Information extractor for infoq.com"""
# Accepts URLs with exactly two path segments after the host.
803 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
# Extract one presentation.
# Raises ExtractorError for an invalid URL or a page without jsclassref.
805 def _real_extract(self, url):
806 mobj = re.match(self._VALID_URL, url)
808 raise ExtractorError(u'Invalid URL: %s' % url)
# The page URL doubles as the id passed to the download/report helpers.
810 webpage = self._download_webpage(url, video_id=url)
811 self.report_extraction(url)
814 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
816 raise ExtractorError(u'Unable to extract video url')
# jsclassref is base64 text that decodes to a URL-quoted stream path.
817 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
818 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
# The remaining arguments of this call are not visible in this excerpt.
821 video_title = self._search_regex(r'contentTitle = "(.*?)";',
824 # Extract description
825 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
826 webpage, u'description', fatal=False)
# Id and extension come from the last path component of the stream URL.
# NOTE(review): the two-way unpack assumes exactly one '.' in the file name.
828 video_filename = video_url.split('/')[-1]
829 video_id, extension = video_filename.split('.')
# Entries of the result; the other keys are not visible in this excerpt.
836 'title': video_title,
837 'ext': extension, # Extension is always(?) mp4, but seems to be flv
839 'description': video_description,
# Extractor for mixcloud.com cloudcasts, marked as not working via _WORKING.
# It reads a JSON description of the cloudcast and probes the listed URLs
# until one can be opened.
844 class MixcloudIE(InfoExtractor):
845 """Information extractor for www.mixcloud.com"""
847 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Group 1 is the uploader, group 2 the cloudcast slug.
848 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
849 IE_NAME = u'mixcloud'
# Prints a fixed progress line; file_id is not used in the message.
851 def report_download_json(self, file_id):
852 """Report JSON download."""
853 self.to_screen(u'Downloading json')
# Return the URL list for format `fmt`. When the entry is keyed by bitrate,
# the requested bitrate is used, or the maximum key when the request is
# None, 'best' or unknown. The `try:` and the return statement are not
# visible in this excerpt.
855 def get_urls(self, jsonData, fmt, bitrate='best'):
856 """Get urls from 'audio_formats' section in json"""
859 bitrate_list = jsonData[fmt]
860 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
861 bitrate = max(bitrate_list) # select highest
863 url_list = jsonData[fmt][bitrate]
864 except TypeError: # we have no bitrate info.
865 url_list = jsonData[fmt]
# Probe the URLs by opening them. The loop header and the return values
# are not visible in this excerpt.
868 def check_urls(self, url_list):
869 """Returns 1st active url from list"""
872 compat_urllib_request.urlopen(url)
874 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print one line per format and bitrate with the extension of the first
# URL; '??' is printed as bitrate when the entry is not keyed by bitrate.
879 def _print_formats(self, formats):
880 print('Available formats:')
881 for fmt in formats.keys():
882 for b in formats[fmt]:
884 ext = formats[fmt][b][0]
885 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
886 except TypeError: # we have no bitrate info
887 ext = formats[fmt][0]
888 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Extract one cloudcast.
# Raises ExtractorError for an invalid URL, a failed JSON download or an
# unavailable requested format.
891 def _real_extract(self, url):
892 mobj = re.match(self._VALID_URL, url)
894 raise ExtractorError(u'Invalid URL: %s' % url)
895 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on match groups assumes byte-string input -
# confirm behaviour on Python 3, where str has no decode().
896 uploader = mobj.group(1).decode('utf-8')
897 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
899 # construct API request
# NOTE(review): the [-3:-1] slice picks uploader and slug only when the URL
# ends with a trailing slash - confirm the expected URL form.
900 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
901 # retrieve .json file with links to files
902 request = compat_urllib_request.Request(file_url)
904 self.report_download_json(file_url)
905 jsonData = compat_urllib_request.urlopen(request).read()
906 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
907 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
910 json_data = json.loads(jsonData)
911 player_url = json_data['player_swf_url']
912 formats = dict(json_data['audio_formats'])
914 req_format = self._downloader.params.get('format', None)
917 if self._downloader.params.get('listformats', None):
918 self._print_formats(formats)
# With no explicit format, try every format until one yields a usable URL.
921 if req_format is None or req_format == 'best':
922 for format_param in formats.keys():
923 url_list = self.get_urls(formats, format_param)
925 file_url = self.check_urls(url_list)
926 if file_url is not None:
# An explicitly requested format must exist in the JSON.
929 if req_format not in formats:
930 raise ExtractorError(u'Format is not available')
932 url_list = self.get_urls(formats, req_format)
933 file_url = self.check_urls(url_list)
934 format_param = req_format
# Entries of the result; the other keys are not visible in this excerpt.
# NOTE(review): uploader and file_id were already decoded above and are
# decoded again here - confirm this is intended.
937 'id': file_id.decode('utf-8'),
938 'url': file_url.decode('utf-8'),
939 'uploader': uploader.decode('utf-8'),
941 'title': json_data['name'],
942 'ext': file_url.split('.')[-1].decode('utf-8'),
943 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
944 'thumbnail': json_data['thumbnail_url'],
945 'description': json_data['description'],
946 'player_url': player_url.decode('utf-8'),
# Extractor for openclassroom.stanford.edu. Three URL shapes are handled: a
# single video, a course page (expanded into its videos) and the site root
# (expanded into its courses).
949 class StanfordOpenClassroomIE(InfoExtractor):
950 """Information extractor for Stanford's Open ClassRoom"""
# Optional named groups: course and video, taken from the query string.
952 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
953 IE_NAME = u'stanfordoc'
# Extract a video, or expand a course/root page by calling self.extract()
# on every link found.
# Raises ExtractorError for an invalid URL, a failed download or metadata
# XML without the expected nodes.
955 def _real_extract(self, url):
956 mobj = re.match(self._VALID_URL, url)
958 raise ExtractorError(u'Invalid URL: %s' % url)
960 if mobj.group('course') and mobj.group('video'): # A specific video
961 course = mobj.group('course')
962 video = mobj.group('video')
# Part of the `info` dict literal; its other entries are not visible here.
964 'id': course + '_' + video,
969 self.report_extraction(info['id'])
# Each video has an XML description next to the media files.
970 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
971 xmlUrl = baseUrl + video + '.xml'
# NOTE(review): the `try:` lines paired with the `except` clauses in this
# method are not visible in this excerpt.
973 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
976 mdoc = xml.etree.ElementTree.fromstring(metaXml)
978 info['title'] = mdoc.findall('./title')[0].text
979 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
981 raise ExtractorError(u'Invalid metadata XML file')
# The extension is whatever follows the last '.' of the media URL.
982 info['ext'] = info['url'].rpartition('.')[2]
984 elif mobj.group('course'): # A course page
985 course = mobj.group('course')
993 coursepage = self._download_webpage(url, info['id'],
994 note='Downloading course info page',
995 errnote='Unable to download course info page')
# The title falls back to the id when the page has no <h1>.
997 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
999 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1000 coursepage, u'description', fatal=False)
# Video page links of the course, passed through orderedSet().
1002 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1005 'type': 'reference',
1006 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Every referenced page is extracted in turn and the results are joined.
1010 for entry in info['list']:
1011 assert entry['type'] == 'reference'
1012 results += self.extract(entry['url'])
# Root page case; the branch keyword is not visible in this excerpt.
1016 'id': 'Stanford OpenClassroom',
1019 'upload_date': None,
1022 self.report_download_webpage(info['id'])
1023 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1025 rootpage = compat_urllib_request.urlopen(rootURL).read()
1026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1027 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1029 info['title'] = info['id']
# Course page links of the site root, passed through orderedSet().
1031 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1034 'type': 'reference',
1035 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1040 for entry in info['list']:
1041 assert entry['type'] == 'reference'
1042 results += self.extract(entry['url'])
# Extractor for mtv.com videos. Page <meta> tags supply the identifiers for
# a mediaGen request, whose XML lists the available renditions.
1045 class MTVIE(InfoExtractor):
1046 """Information extractor for MTV.com"""
# Named groups: proto (optional scheme) and videoid (numeric).
1048 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Extract one video.
# Raises ExtractorError for an invalid URL, a failed metadata download or
# a rendition without the expected fields.
1051 def _real_extract(self, url):
1052 mobj = re.match(self._VALID_URL, url)
1054 raise ExtractorError(u'Invalid URL: %s' % url)
# URLs given without a scheme are completed with http://.
1055 if not mobj.group('proto'):
1056 url = 'http://' + url
1057 video_id = mobj.group('videoid')
1059 webpage = self._download_webpage(url, video_id)
1061 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1062 webpage, u'song name', fatal=False)
# The remaining arguments of this call are not visible in this excerpt.
1064 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1067 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1068 webpage, u'mtvn_uri', fatal=False)
1070 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1071 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri and content_id are looked up with fatal=False and
# then concatenated without a check - confirm what the helpers return when
# nothing matches.
1073 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1074 self.report_extraction(video_id)
1075 request = compat_urllib_request.Request(videogen_url)
# NOTE(review): the `try:` paired with this `except` is not visible here.
1077 metadataXml = compat_urllib_request.urlopen(request).read()
1078 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1079 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1081 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1082 renditions = mdoc.findall('.//rendition')
1084 # For now, always pick the highest quality.
# The last rendition in document order is used.
1085 rendition = renditions[-1]
# The extension is the part after '/' in the rendition's type attribute;
# the format label is "<ext>-<width>x<height>_<bitrate>".
1088 _,_,ext = rendition.attrib['type'].partition('/')
1089 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1090 video_url = rendition.find('./src').text
1092 raise ExtractorError('Invalid rendition field.')
# Entries of the result; the other keys are not visible in this excerpt.
# NOTE(review): `performer` is not assigned in any visible line of this
# method - confirm it is defined in the full file.
1097 'uploader': performer,
1098 'upload_date': None,
1099 'title': video_title,
# YoukuIE: extractor for v.youku.com show pages; builds one download URL per
# video segment.
# NOTE(review): this view of the class has gaps -- some `def` lines, list
# initialisations, format-selection branches and return statements are not
# visible here. Confirm against the full file before editing code.
1107 class YoukuIE(InfoExtractor):
# `ID` is the alphanumeric token between "id_" and ".html".
1108 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Body of a session-id helper whose `def` line is not visible (presumably the
# `_gen_sid` called from _real_extract): the current time in milliseconds
# followed by two random integers, all rendered as decimal digits.
1111 nowTime = int(time.time() * 1000)
1112 random1 = random.randint(1000,1998)
1113 random2 = random.randint(1000,9999)
1115 return "%d%d%d" %(nowTime,random1,random2)
# Produces a seed-dependent ordering of the characters in `source`: each
# iteration advances the seed with a linear congruential step (mod 65536),
# scales it to an index into the remaining characters, appends that character
# to `mixed` and removes it from `source`.
1117 def _get_file_ID_mix_string(self, seed):
1119 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# range() is evaluated once, so the loop runs once per original character even
# though `source` shrinks inside the body.
1121 for i in range(len(source)):
1122 seed = (seed * 211 + 30031 ) % 65536
# NOTE(review): `seed / 65536` needs true division to give a fraction; confirm
# the module enables it when run under Python 2.
1123 index = math.floor(seed / 65536 * len(source) )
1124 mixed.append(source[int(index)])
1125 source.remove(source[int(index)])
1126 #return ''.join(mixed)
# Decodes an obfuscated file id: `fileId` is split on '*' and each piece is
# used as an integer index into the mixed character list; the selected
# characters are joined into the real id.
1129 def _get_file_id(self, fileId, seed):
1130 mixed = self._get_file_ID_mix_string(seed)
1131 ids = fileId.split('*')
1135 realId.append(mixed[int(ch)])
1136 return ''.join(realId)
# Fetches the getPlayList JSON for the video, chooses a stream format, decodes
# the file id and builds one info dict per segment.
1138 def _real_extract(self, url):
1139 mobj = re.match(self._VALID_URL, url)
1141 raise ExtractorError(u'Invalid URL: %s' % url)
1142 video_id = mobj.group('ID')
1144 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1146 jsondata = self._download_webpage(info_url, video_id)
1148 self.report_extraction(video_id)
1150 config = json.loads(jsondata)
# Title and seed are read from the first entry of config['data'].
1152 video_title = config['data'][0]['title']
1153 seed = config['data'][0]['seed']
# The requested format comes from the downloader parameters; the formats on
# offer are the keys of `streamfileids`.
1155 format = self._downloader.params.get('format', None)
1156 supported_format = list(config['data'][0]['streamfileids'].keys())
1158 if format is None or format == 'best':
1159 if 'hd2' in supported_format:
1164 elif format == 'worst':
# Encoded file id for the chosen format, plus the `k` value of each segment.
1172 fileid = config['data'][0]['streamfileids'][format]
1173 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1174 except (UnicodeDecodeError, ValueError, KeyError):
1175 raise ExtractorError(u'Unable to extract info section')
1178 sid = self._gen_sid()
1179 fileid = self._get_file_id(fileid, seed)
1181 #column 8,9 of fileid represent the segment number
1182 #fileid[7:9] should be changed
# NOTE(review): the code below replaces the characters at indices 8 and 9
# (fileid[0:8] + two hex digits + fileid[10:]), which differs from the
# "fileid[7:9]" wording above -- confirm which is intended.
1183 for index, key in enumerate(keys):
1185 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1186 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Each segment gets its own id: the video id plus a two-digit part number.
1189 'id': '%s_part%02d' % (video_id, index),
1190 'url': download_url,
1192 'upload_date': None,
1193 'title': video_title,
1196 files_info.append(info)
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise`, the start of the result dict and the return are not visible here.
# Confirm against the full file before editing code.
1201 class XNXXIE(InfoExtractor):
1202 """Information extractor for xnxx.com"""
# Group 1 is the numeric video id; group 2 is the rest of the path.
1204 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: the stream URL and the large thumbnail are read from
# `flv_url=` / `url_bigthumb=` values terminated by '&'; the title is the
# <title> text before " - XNXX.COM".
1206 VIDEO_URL_RE = r'flv_url=(.*?)&'
1207 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1208 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
# Downloads the video page and pulls URL, title and thumbnail out of it with
# the class-level patterns.
1210 def _real_extract(self, url):
1211 mobj = re.match(self._VALID_URL, url)
1213 raise ExtractorError(u'Invalid URL: %s' % url)
1214 video_id = mobj.group(1)
1216 # Get webpage content
1217 webpage = self._download_webpage(url, video_id)
# The stream URL is percent-encoded in the page, so it is unquoted after the match.
1219 video_url = self._search_regex(self.VIDEO_URL_RE,
1220 webpage, u'video URL')
1221 video_url = compat_urllib_parse.unquote(video_url)
1223 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# The thumbnail lookup is non-fatal (fatal=False).
1226 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1227 webpage, u'thumbnail', fatal=False)
1233 'upload_date': None,
1234 'title': video_title,
1236 'thumbnail': video_thumbnail,
1237 'description': None,
# NOTE(review): this view of the class has gaps -- guards ahead of the `raise`
# lines, a `try:` line and the start/end of the result dict are not visible
# here. Confirm against the full file before editing code.
1241 class GooglePlusIE(InfoExtractor):
1242 """Information extractor for plus.google.com."""
# Group 1 is the post id that follows ".../posts/".
1244 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1245 IE_NAME = u'plus.google'
# Two-step extraction: read date, uploader and title from the post page, then
# follow the photo/video page link and pick one of the stream links found there.
1247 def _real_extract(self, url):
1248 # Extract id from URL
1249 mobj = re.match(self._VALID_URL, url)
1251 raise ExtractorError(u'Invalid URL: %s' % url)
# The whole matched text is reused as the post URL; group 1 is the video id.
1253 post_url = mobj.group(0)
1254 video_id = mobj.group(1)
# The extension is fixed to 'flv' rather than derived from the stream URL.
1256 video_extension = 'flv'
1258 # Step 1, Retrieve post webpage to extract further information
1259 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1261 self.report_extraction(video_id)
1263 # Extract update date
1264 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1265 webpage, u'upload date', fatal=False)
# NOTE(review): the lookup above is non-fatal; a guard for a missing date may
# sit on a line not visible in this view -- confirm before relying on it.
# The date text is parsed as YYYY-MM-DD and re-emitted as YYYYMMDD.
1267 # Convert timestring to a format suitable for filename
1268 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1269 upload_date = upload_date.strftime('%Y%m%d')
1272 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1273 webpage, u'uploader', fatal=False)
1276 # Get the first line for title
# The capture stops at the first newline, '<' or '"'; falls back to u'NA'.
1277 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1278 webpage, 'title', default=u'NA')
1280 # Step 2, Stimulate clicking the image box to launch video
1281 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1282 webpage, u'video page URL')
# `webpage` is rebound to the video page from here on.
1283 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1285 # Extract video links on video page
1286 """Extract video links of all sizes"""
# findall yields (resolution, url) tuples; both elements are strings.
1287 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1288 mobj = re.findall(pattern, webpage)
1290 raise ExtractorError(u'Unable to extract video links')
1292 # Sort in resolution
# NOTE(review): the tuples hold strings, so this sort is lexicographic rather
# than numeric (e.g. '1080' orders before '720') -- confirm that is acceptable.
1293 links = sorted(mobj)
1295 # Choose the lowest of the sort, i.e. highest resolution
# The code takes the LAST element of the sorted list.
1296 video_url = links[-1]
1297 # Only get the url. The resolution part in the tuple has no use anymore
1298 video_url = video_url[-1]
1299 # Treat escaped \u0026 style hex
# str.decode is tried first; the AttributeError branch covers interpreters
# where str has no decode().
1301 video_url = video_url.decode("unicode_escape")
1302 except AttributeError: # Python 3
1303 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1309 'uploader': uploader,
1310 'upload_date': upload_date,
1311 'title': video_title,
1312 'ext': video_extension,
# NBAIE: extractor for nba.com video pages.
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise` and most of the result dict are not visible here. Confirm against
# the full file before editing code.
1315 class NBAIE(InfoExtractor):
# Group 1 is the path after "/video" (leading slash included); a trailing
# "/index.html" and any query string are left out of the capture.
1316 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
# Builds the stream URL directly from the page path; the page itself is only
# downloaded for its title and description.
1319 def _real_extract(self, url):
1320 mobj = re.match(self._VALID_URL, url)
1322 raise ExtractorError(u'Invalid URL: %s' % url)
1324 video_id = mobj.group(1)
1326 webpage = self._download_webpage(url, video_id)
# The stream URL is a fixed CDN prefix + the captured path + a fixed
# 1280x720 mp4 suffix.
1328 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# The last path component serves as the short id and as the default title;
# the 'NBA.com: ' prefix is removed from whichever title is used.
1330 shortened_video_id = video_id.rpartition('/')[2]
1331 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1332 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1334 # It isn't there in the HTML it returns to us
1335 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
# The description meta tag may use either a content= or a value= attribute.
1337 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
1340 'id': shortened_video_id,
1344 # 'uploader_date': uploader_date,
1345 'description': description,
# NOTE(review): this view of the class has gaps -- the closing lines of the
# _VALID_URL literal, several guards, dict/list initialisations, loop headers
# and the final return are not visible here. Confirm against the full file
# before editing code.
1349 class JustinTVIE(InfoExtractor):
1350 """Information extractor for justin.tv and twitch.tv"""
1351 # TODO: One broadcast may be split into multiple videos. The key
1352 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1353 # starts at 1 and increases. Can we treat all parts as one video?
# Verbose-mode pattern for twitch.tv / justin.tv URLs with three visible
# alternatives: a bare channel (`channelid`), a broadcast under /b/
# (`videoid`) and a chapter under /c/ (`chapterid`).
1355 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1357 (?P<channelid>[^/]+)|
1358 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1359 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1363 _JUSTIN_PAGE_LIMIT = 100
# _JUSTIN_PAGE_LIMIT (above) is the page size used for offset/limit paging of
# the API in _real_extract and in the progress message below.
1364 IE_NAME = u'justin.tv'
1366 def report_download_page(self, channel, offset):
1367 """Report attempt to download a single page of videos."""
1368 self.to_screen(u'%s: Downloading video information from %d to %d' %
1369 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
# Downloads one API page, decodes it as JSON and converts each clip into an
# info entry. Returns (number of items in the response, list of entries).
1371 # Return count of items, list of *valid* items
1372 def _parse_page(self, url, video_id):
1373 webpage = self._download_webpage(url, video_id,
1374 u'Downloading video info JSON',
1375 u'unable to download video info JSON')
# Anything other than a JSON list is treated as an API error object.
1377 response = json.loads(webpage)
1378 if type(response) != list:
1379 error_text = response.get('error', 'unknown error')
1380 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1382 for clip in response:
1383 video_url = clip['video_file_url']
# Extension is the URL suffix without the dot; the upload date is the first
# ten characters of start_time with dashes removed.
1385 video_extension = os.path.splitext(video_url)[1][1:]
1386 video_date = re.sub('-', '', clip['start_time'][:10])
# The uploader id falls back from user_id to channel_id; the title falls back
# to the clip id. Note that `video_id` (the parameter) is rebound here.
1387 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1388 video_id = clip['id']
1389 video_title = clip.get('title', video_id)
1393 'title': video_title,
1394 'uploader': clip.get('channel_name', video_uploader_id),
1395 'uploader_id': video_uploader_id,
1396 'upload_date': video_date,
1397 'ext': video_extension,
1399 return (len(response), info)
# Dispatches on the kind of URL: channel archives, a single chapter or a
# single broadcast.
1401 def _real_extract(self, url):
1402 mobj = re.match(self._VALID_URL, url)
1404 raise ExtractorError(u'invalid URL: %s' % url)
1406 api_base = 'http://api.justin.tv'
# Channel URL: the archive listing endpoint for that channel is used.
1408 if mobj.group('channelid'):
1410 video_id = mobj.group('channelid')
1411 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: find the archive id in the page, locate the matching <archive>
# in the chapter XML, then fetch display metadata from the kraken API.
1412 elif mobj.group('chapterid'):
1413 chapter_id = mobj.group('chapterid')
1415 webpage = self._download_webpage(url, chapter_id)
1416 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1418 raise ExtractorError(u'Cannot find archive of a chapter')
1419 archive_id = m.group(1)
1421 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1422 chapter_info_xml = self._download_webpage(api, chapter_id,
1423 note=u'Downloading chapter information',
1424 errnote=u'Chapter information download failed')
1425 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Scan the <archive> elements for the one whose <id> equals archive_id; `a` is
# used after the loop as the matching element.
1426 for a in doc.findall('.//archive'):
1427 if archive_id == a.find('./id').text:
1430 raise ExtractorError(u'Could not find chapter in chapter information')
# The extension is whatever follows the last '.', or u'flv' if that is empty.
1432 video_url = a.find('./video_file_url').text
1433 video_ext = video_url.rpartition('.')[2] or u'flv'
1435 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1436 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1437 note='Downloading chapter metadata',
1438 errnote='Download of chapter metadata failed')
1439 chapter_info = json.loads(chapter_info_json)
# bracket_start / bracket_end are only used in the warning text below.
1441 bracket_start = int(doc.find('.//bracket_start').text)
1442 bracket_end = int(doc.find('.//bracket_end').text)
1444 # TODO determine start (and probably fix up file)
1445 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1446 #video_url += u'?start=' + TODO:start_timestamp
1447 # bracket_start is 13290, but we want 51670615
1448 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1449 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
# Chapter ids are prefixed with u'c' in the result.
1452 'id': u'c' + chapter_id,
1455 'title': chapter_info['title'],
1456 'thumbnail': chapter_info['preview'],
1457 'description': chapter_info['description'],
1458 'uploader': chapter_info['channel']['display_name'],
1459 'uploader_id': chapter_info['channel']['name'],
# Broadcast URL: the by_archive endpoint is queried for the given video id.
1463 video_id = mobj.group('videoid')
1464 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1466 self.report_extraction(video_id)
# Paging: pages of `limit` items are requested by offset and accumulated in
# `info`; the stop test below fires when paging is off or a page comes back
# with a count different from `limit`. The loop header and the offset update
# are not visible in this view.
1470 limit = self._JUSTIN_PAGE_LIMIT
1473 self.report_download_page(video_id, offset)
1474 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1475 page_count, page_info = self._parse_page(page_url, video_id)
1476 info.extend(page_info)
1477 if not paged or page_count != limit:
# FunnyOrDieIE: extractor for funnyordie.com video pages.
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise` and most of the result dict are not visible here. Confirm against
# the full file before editing code.
1482 class FunnyOrDieIE(InfoExtractor):
# `id` is the hexadecimal token that follows /videos/.
1483 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# Downloads the video page and scrapes stream URL, title and description.
1485 def _real_extract(self, url):
1486 mobj = re.match(self._VALID_URL, url)
1488 raise ExtractorError(u'invalid URL: %s' % url)
1490 video_id = mobj.group('id')
1491 webpage = self._download_webpage(url, video_id)
# The pattern skips the first <source> tag inside <video> and captures the
# src attribute of the second one.
1493 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1494 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are supplied as a tuple: the player <h1>, then <title>.
1496 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1497 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
# The og:description lookup is non-fatal (fatal=False).
1499 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1500 webpage, u'description', fatal=False, flags=re.DOTALL)
1507 'description': video_description,
# SteamIE: extractor for store.steampowered.com video/app pages; returns the
# movies found on a game's video page as a playlist.
# NOTE(review): this view of the class has gaps -- parts of the _VALID_URL
# literal (including its closing lines and the `gameID` group read below), the
# decorator of suitable(), the `videos` list setup and a guard are not
# visible here. Confirm against the full file before editing code.
# _VALID_URL is matched with re.VERBOSE (see suitable/_real_extract), so the
# '#' notes inside the literal are regex comments.
1511 class SteamIE(InfoExtractor):
1512 _VALID_URL = r"""http://store\.steampowered\.com/
1514 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1516 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1518 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# The age-check template carries a fixed birth date (1 January 1970) in its
# query string.
1519 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
1522 def suitable(cls, url):
1523 """Receives a URL and returns True if suitable for this IE."""
1524 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Downloads the game's video page (going through the age-check URL when the
# birth-date prompt is shown) and pairs up movies, titles and thumbnails.
1526 def _real_extract(self, url):
1527 m = re.match(self._VALID_URL, url, re.VERBOSE)
1528 gameID = m.group('gameID')
1530 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1531 webpage = self._download_webpage(videourl, gameID)
# If the page shows the birth-date prompt, fetch it again via the age-check URL.
1533 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1534 videourl = self._AGECHECK_TEMPLATE % gameID
1535 self.report_age_confirmation()
1536 webpage = self._download_webpage(videourl, gameID)
1538 self.report_extraction(gameID)
1539 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1540 webpage, 'game title')
# Three independent scans of the page: movie entries (id + file URL) from the
# script block, display titles, and thumbnails.
1542 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1543 mweb = re.finditer(urlRE, webpage)
1544 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1545 titles = re.finditer(namesRE, webpage)
1546 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1547 thumbs = re.finditer(thumbsRE, webpage)
# The three match streams are paired positionally; zip stops at the shortest,
# so the pairing assumes the page lists all three in the same order.
1549 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1550 video_id = vid.group('videoID')
1551 title = vtitle.group('videoName')
1552 video_url = vid.group('videoURL')
1553 video_thumb = thumb.group('thumbnail')
1555 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1560 'title': unescapeHTML(title),
1561 'thumbnail': video_thumb
# The result is a single playlist entry keyed by the game id and title.
1564 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: extractor for recorded videos on ustream.tv.
# NOTE(review): this view of the class has gaps -- the tail of one call and
# most of the result dict are not visible here. Confirm against the full file
# before editing code.
1566 class UstreamIE(InfoExtractor):
# `videoID` is the numeric id that follows /recorded/.
1567 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1568 IE_NAME = u'ustream'
# The stream URL is derived from the id alone; the page is downloaded only
# for title, uploader and thumbnail.
1570 def _real_extract(self, url):
# No None-check on the match is visible here.
1571 m = re.match(self._VALID_URL, url)
1572 video_id = m.group('videoID')
1574 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1575 webpage = self._download_webpage(url, video_id)
1577 self.report_extraction(video_id)
# The title is read from a data-title attribute (greedy `.+` capture).
1579 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail lookups are non-fatal (fatal=False).
1582 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1583 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1585 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1586 webpage, u'thumbnail', fatal=False)
1592 'title': video_title,
1593 'uploader': uploader,
1594 'thumbnail': thumbnail,
# WorldStarHipHopIE: extractor for worldstarhiphop.com / worldstarcandy.com
# video pages.
# NOTE(review): this view of the class has gaps -- the bodies of the 'mp4'
# test, the condition guarding the candy-title fallback and most of the result
# dict are not visible here. Confirm against the full file before editing code.
1598 class WorldStarHipHopIE(InfoExtractor):
# Accepts the www. and m. hosts; `id` is everything after "v=".
1599 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1600 IE_NAME = u'WorldStarHipHop'
# Downloads the page and scrapes the stream URL, title and thumbnail.
1602 def _real_extract(self, url):
1603 m = re.match(self._VALID_URL, url)
1604 video_id = m.group('id')
1606 webpage_src = self._download_webpage(url, video_id)
# The stream URL is the "file" variable passed to so.addVariable(...).
1608 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1609 webpage_src, u'video URL')
# The substring test on the URL presumably selects the extension; the branch
# bodies are not visible here.
1611 if 'mp4' in video_url:
1616 video_title = self._html_search_regex(r"<title>(.*)</title>",
1617 webpage_src, u'title')
1619 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1620 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1621 webpage_src, u'thumbnail', fatal=False)
# Fallback title: text captured from the page's "candytitles" element.
1624 _title = r"""candytitles.*>(.*)</span>"""
1625 mobj = re.search(_title, webpage_src)
1626 if mobj is not None:
1627 video_title = mobj.group(1)
1632 'title' : video_title,
1633 'thumbnail' : thumbnail,
# RBMARadioIE: extractor for show pages on rbmaradio.com.
# NOTE(review): this view of the class has gaps -- the `try:` ahead of the
# JSON parse and the start/end of the result dict are not visible here.
# Confirm against the full file before editing code.
1638 class RBMARadioIE(InfoExtractor):
# `videoID` is the final path segment after /shows/.
1639 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# Reads the show's metadata from a JSON object embedded in the page script.
1641 def _real_extract(self, url):
1642 m = re.match(self._VALID_URL, url)
1643 video_id = m.group('videoID')
1645 webpage = self._download_webpage(url, video_id)
# The JSON text is whatever follows "gon.show=" up to a ';' at end of line
# (re.MULTILINE makes '$' match at line ends).
1647 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1648 webpage, u'json data', flags=re.MULTILINE)
1651 data = json.loads(json_data)
1652 except ValueError as e:
1653 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' is appended to the Akamai URL; the extension is taken from the
# path part of the resulting URL.
1655 video_url = data['akamai_url'] + '&cbr=256'
1656 url_parts = compat_urllib_parse_urlparse(video_url)
1657 video_ext = url_parts.path.rpartition('.')[2]
# Optional fields use .get(); nested objects default to {} so a missing
# 'host' or 'image' yields None instead of raising.
1662 'title': data['title'],
1663 'description': data.get('teaser_text'),
1664 'location': data.get('country_of_origin'),
1665 'uploader': data.get('host', {}).get('name'),
1666 'uploader_id': data.get('host', {}).get('slug'),
1667 'thumbnail': data.get('image', {}).get('large_url_2x'),
1668 'duration': data.get('duration'),
# NOTE(review): this view of the class has gaps -- `try:`/`except` lines,
# loop headers, list initialisations, several guards and some return
# statements are not visible here. Confirm against the full file before
# editing code.
1673 class YouPornIE(InfoExtractor):
1674 """Information extractor for youporn.com."""
# `videoid` is the numeric id after /watch/; `title` is the next path segment.
1675 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Prints a two-column table (extension, format label) to stdout.
1677 def _print_formats(self, formats):
1678 """Print all available formats"""
1679 print(u'Available formats:')
1680 print(u'ext\t\tformat')
1681 print(u'---------------------------------')
1682 for format in formats:
1683 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Looks for the entry whose "format" equals req_format; the loop header and
# the return statements are not visible here.
1685 def _specific(self, req_format, formats):
1687 if(x["format"]==req_format):
# Downloads the page with the age-verification cookie set, reads metadata
# from the embedded JSON, builds one entry per download link and returns the
# entries selected by the requested format.
1691 def _real_extract(self, url):
1692 mobj = re.match(self._VALID_URL, url)
1694 raise ExtractorError(u'Invalid URL: %s' % url)
1695 video_id = mobj.group('videoid')
1697 req = compat_urllib_request.Request(url)
1698 req.add_header('Cookie', 'age_verified=1')
1699 webpage = self._download_webpage(req, video_id)
1701 # Get JSON parameters
1702 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1704 params = json.loads(json_params)
1706 raise ExtractorError(u'Invalid JSON')
1708 self.report_extraction(video_id)
# All five fields are required keys of the JSON parameters.
1710 video_title = params['title']
1711 upload_date = unified_strdate(params['release_date_f'])
1712 video_description = params['description']
1713 video_uploader = params['submitted_by']
1714 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception instance, not a string, so
# the '+' below looks like it would itself raise TypeError -- confirm.
1716 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1718 # Get all of the formats available
1719 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1720 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1721 webpage, u'download list').strip()
1723 # Get all of the links from the page
1724 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1725 links = re.findall(LINK_RE, download_list_html)
1726 if(len(links) == 0):
1727 raise ExtractorError(u'ERROR: no known formats available for video')
1729 self.to_screen(u'Links found: %d' % len(links))
1734 # A link looks like this:
1735 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1736 # A path looks like this:
1737 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Per link: unescape HTML entities, take the URL path, derive the extension
# from its suffix, and build the format label from the first two '_'-separated
# parts of the fifth path component (e.g. "480p_370k_..." gives "480p-370k").
1738 video_url = unescapeHTML( link )
1739 path = compat_urllib_parse_urlparse( video_url ).path
1740 extension = os.path.splitext( path )[1][1:]
1741 format = path.split('/')[4].split('_')[:2]
1744 format = "-".join( format )
1745 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1750 'uploader': video_uploader,
1751 'upload_date': upload_date,
1752 'title': video_title,
1755 'thumbnail': thumbnail,
1756 'description': video_description
# With the 'listformats' option set, the table of formats is printed.
1759 if self._downloader.params.get('listformats', None):
1760 self._print_formats(formats)
# Format selection: None/'best', 'worst' (last entry), '-1'/'all', or an
# exact format label looked up through _specific().
1763 req_format = self._downloader.params.get('format', None)
1764 self.to_screen(u'Format: %s' % req_format)
1766 if req_format is None or req_format == 'best':
1768 elif req_format == 'worst':
1769 return [formats[-1]]
1770 elif req_format in ('-1', 'all'):
1773 format = self._specific( req_format, formats )
1775 raise ExtractorError(u'Requested format not available')
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise`, parts of the result dict and the return are not visible here.
# Confirm against the full file before editing code.
1780 class PornotubeIE(InfoExtractor):
1781 """Information extractor for pornotube.com."""
# Optional /c/<channel>, then required /m/<videoid> and /<title> segments.
1782 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# The id and title come from the URL itself; the page supplies the stream URL
# and the upload date.
1784 def _real_extract(self, url):
1785 mobj = re.match(self._VALID_URL, url)
1787 raise ExtractorError(u'Invalid URL: %s' % url)
1789 video_id = mobj.group('videoid')
1790 video_title = mobj.group('title')
1792 # Get webpage content
1793 webpage = self._download_webpage(url, video_id)
# The stream URL is a quoted .flv link on a videoN.pornotube.com host; it is
# percent-decoded after extraction.
1796 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1797 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1798 video_url = compat_urllib_parse.unquote(video_url)
1800 #Get the uploaded date
1801 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# The date lookup is non-fatal; it is normalised only when a value was found.
1802 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1803 if upload_date: upload_date = unified_strdate(upload_date)
1805 info = {'id': video_id,
1808 'upload_date': upload_date,
1809 'title': video_title,
# NOTE(review): this view of the class has gaps -- the guards ahead of the
# `raise` lines, parts of the result dict and the return are not visible here.
# Confirm against the full file before editing code.
1815 class YouJizzIE(InfoExtractor):
1816 """Information extractor for youjizz.com."""
# `videoid` is the text between /videos/ and the trailing "html"; the '.'
# before "html" is unescaped, so it matches any single character.
1817 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
# Two downloads: the watch page (for the title and the embed link) and the
# embed page (for the stream URL).
1819 def _real_extract(self, url):
1820 mobj = re.match(self._VALID_URL, url)
1822 raise ExtractorError(u'Invalid URL: %s' % url)
1824 video_id = mobj.group('videoid')
1826 # Get webpage content
1827 webpage = self._download_webpage(url, video_id)
1829 # Get the video title
1830 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1831 webpage, u'title').strip()
1833 # Get the embed page
1834 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1836 raise ExtractorError(u'ERROR: unable to extract embed page')
# `video_id` is replaced by the numeric id from the embed URL, and `webpage`
# is rebound to the embed page.
1838 embed_page_url = result.group(0).strip()
1839 video_id = result.group('videoid')
1841 webpage = self._download_webpage(embed_page_url, video_id)
# The stream URL is the "file" variable given to so.addVariable(...) on the
# embed page.
1844 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1845 webpage, u'video URL')
1847 info = {'id': video_id,
1849 'title': video_title,
1852 'player_url': embed_page_url}
# EightTracksIE: extractor for 8tracks.com mixes; walks the play/next API to
# collect one entry per track.
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise`, the assignment of `mix_id`, the result list setup, the loop exit
# and the return are not visible here. Confirm against the full file before
# editing code.
1856 class EightTracksIE(InfoExtractor):
# `user` and `id` are the two path segments; a trailing '#fragment' is allowed.
1858 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1860 def _real_extract(self, url):
1861 mobj = re.match(self._VALID_URL, url)
1863 raise ExtractorError(u'Invalid URL: %s' % url)
1864 playlist_id = mobj.group('id')
1866 webpage = self._download_webpage(url, playlist_id)
# The mix description is the value assigned to PAGE.mix in the page script,
# captured up to the first ";\n" and parsed as JSON.
1868 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1869 data = json.loads(json_like)
# A random number serves as the play-session token in the API URLs.
1871 session = str(random.randint(0, 1000000000))
1873 track_count = data['tracks_count']
# NOTE(review): `mix_id` is used below but not assigned on any line visible
# in this view -- confirm where it is set.
1874 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1875 next_url = first_url
# One API call per track: the first goes to .../play, the following ones to
# .../next with the previous track's id; the at_last_track flag in the
# response signals the final track.
1877 for i in itertools.count():
1878 api_json = self._download_webpage(next_url, playlist_id,
1879 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1880 errnote=u'Failed to download song information')
1881 api_data = json.loads(api_json)
1882 track_data = api_data[u'set']['track']
# The title is "<performer> - <name>"; the bare name is kept as raw_title.
1884 'id': track_data['id'],
1885 'url': track_data['track_file_stream_url'],
1886 'title': track_data['performer'] + u' - ' + track_data['name'],
1887 'raw_title': track_data['name'],
1888 'uploader_id': data['user']['login'],
1892 if api_data['set']['at_last_track']:
1894 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: extractor for keek.com posts.
# NOTE(review): this view of the class has gaps -- the tail of one call and
# parts of the result dict are not visible here. Confirm against the full file
# before editing code.
1897 class KeekIE(InfoExtractor):
# Accepts the short form keek.com/!<id> and the long form
# keek.com/<user>/keeks/<id>.
1898 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
# Stream and thumbnail URLs are derived from the id alone; the page is
# downloaded only for title and uploader.
1901 def _real_extract(self, url):
# No None-check on the match is visible here.
1902 m = re.match(self._VALID_URL, url)
1903 video_id = m.group('videoID')
1905 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1906 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1907 webpage = self._download_webpage(url, video_id)
1909 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# The uploader is the first <h2> after the "user-name-and-bio" container
# ([\S\s] lets the match span lines); the lookup is non-fatal.
1912 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1913 webpage, u'uploader', fatal=False)
1919 'title': video_title,
1920 'thumbnail': thumbnail,
1921 'uploader': uploader
# TEDIE: extractor for ted.com talks and playlists.
# NOTE(review): this view of the class has gaps -- parts of two verbose
# regex literals (including their opening/closing lines), the decorator of
# suitable(), an `else:` branch and parts of the result dict are not visible
# here. Confirm against the full file before editing code.
# _VALID_URL is matched with re.VERBOSE; the visible alternatives are a
# playlist (`type_playlist`, `playlist_id`) and a single talk (`type_talk`),
# followed by an optional /lang/ segment and the page `name`.
# In _playlist_videos_info the three lines starting with `<li\ id="talk_` are
# the visible part of the `video_RE` pattern used a few lines further down.
1925 class TEDIE(InfoExtractor):
1926 _VALID_URL=r'''http://www\.ted\.com/
1928 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1930 ((?P<type_talk>talks)) # We have a simple talk
1932 (/lang/(.*?))? # The url may contain the language
1933 /(?P<name>\w+) # Here goes the name and then ".html"
1937 def suitable(cls, url):
1938 """Receives a URL and returns True if suitable for this IE."""
1939 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Talk URLs yield a single-element list with the talk's info; otherwise the
# URL is handled as a playlist.
1941 def _real_extract(self, url):
1942 m=re.match(self._VALID_URL, url, re.VERBOSE)
1943 if m.group('type_talk'):
1944 return [self._talk_info(url)]
1946 playlist_id=m.group('playlist_id')
1947 name=m.group('name')
1948 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1949 return [self._playlist_videos_info(url,name,playlist_id)]
1951 def _playlist_videos_info(self,url,name,playlist_id=0):
1952 '''Returns the videos of the playlist'''
1954 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1955 ([.\s]*?)data-playlist_item_id="(\d+)"
1956 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1958 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1959 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
# Two independent scans of the playlist page: list items and talk-title links.
1960 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1961 m_names=re.finditer(video_name_RE,webpage)
1963 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1964 webpage, 'playlist title')
# The two match streams are paired positionally; only the talk URL from the
# title link ends up in the entry, as a url_result handled by the 'TED'
# extractor.
1966 playlist_entries = []
1967 for m_video, m_name in zip(m_videos,m_names):
1968 video_id=m_video.group('video_id')
1969 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1970 playlist_entries.append(self.url_result(talk_url, 'TED'))
1971 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1973 def _talk_info(self, url, video_id=0):
1974 """Return the video for the talk in the url"""
1975 m = re.match(self._VALID_URL, url,re.VERBOSE)
1976 video_name = m.group('name')
1977 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1978 self.report_extraction(video_name)
1979 # If the url includes the language we get the title translated
1980 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
# Talk metadata is the JSON object assigned to `talkDetails` in a script tag.
1982 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1983 webpage, 'json data')
1984 info = json.loads(json_data)
1985 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1986 webpage, 'description', flags = re.DOTALL)
1988 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1989 webpage, 'thumbnail')
# The last entry of htmlStreams is the one used as the download URL.
1992 'url': info['htmlStreams'][-1]['file'],
1995 'thumbnail': thumbnail,
1996 'description': desc,
# MySpassIE: extractor for myspass.de; reads everything from the site's video
# metadata XML.
# NOTE(review): this view of the class has gaps -- the condition guarding the
# second path split, the fallback branches for format/description/thumbnail
# and parts of the result dict are not visible here. Confirm against the full
# file before editing code.
2000 class MySpassIE(InfoExtractor):
# Any URL under www.myspass.de matches (the dots are unescaped, so each also
# matches an arbitrary character).
2001 _VALID_URL = r'http://www.myspass.de/.*'
2003 def _real_extract(self, url):
2004 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2006 # video id is the last path element of the URL
2007 # usually there is a trailing slash, so also try the second but last
2008 url_path = compat_urllib_parse_urlparse(url).path
2009 url_parent_path, video_id = os.path.split(url_path)
2011 _, video_id = os.path.split(url_parent_path)
# The metadata text is encoded as UTF-8 bytes before being parsed as XML.
2014 metadata_url = META_DATA_URL_TEMPLATE % video_id
2015 metadata_text = self._download_webpage(metadata_url, video_id)
2016 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2018 # extract values from metadata
# <url_flv> and <title> are mandatory: a missing element raises ExtractorError.
2019 url_flv_el = metadata.find('url_flv')
2020 if url_flv_el is None:
2021 raise ExtractorError(u'Unable to extract download url')
2022 video_url = url_flv_el.text
2023 extension = os.path.splitext(video_url)[1][1:]
2024 title_el = metadata.find('title')
2025 if title_el is None:
2026 raise ExtractorError(u'Unable to extract title')
2027 title = title_el.text
# <format_id>, <description> and <imagePreview> are read when present; their
# fallback values are on lines not visible here.
2028 format_id_el = metadata.find('format_id')
2029 if format_id_el is None:
2032 format = format_id_el.text
2033 description_el = metadata.find('description')
2034 if description_el is not None:
2035 description = description_el.text
2038 imagePreview_el = metadata.find('imagePreview')
2039 if imagePreview_el is not None:
2040 thumbnail = imagePreview_el.text
2049 'thumbnail': thumbnail,
2050 'description': description
# SpiegelIE: extractor for spiegel.de video pages.
# NOTE(review): this view of the class has gaps -- the tail of one call and
# most of the result dict are not visible here. Confirm against the full file
# before editing code.
2054 class SpiegelIE(InfoExtractor):
# `videoID` is the number after the last '-' in the page name; ".html" and a
# '#fragment' are optional.
2055 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# The title comes from the HTML page; file name and duration come from a
# per-video XML document.
2057 def _real_extract(self, url):
# No None-check on the match is visible here.
2058 m = re.match(self._VALID_URL, url)
2059 video_id = m.group('videoID')
2061 webpage = self._download_webpage(url, video_id)
2063 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2066 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2067 xml_code = self._download_webpage(xml_url, video_id,
2068 note=u'Downloading XML', errnote=u'Failed to download XML')
# The last child of the XML root is the variant used; its <filename> and
# <duration> children supply the file name and the duration (as a float).
2070 idoc = xml.etree.ElementTree.fromstring(xml_code)
2071 last_type = idoc[-1]
2072 filename = last_type.findall('./filename')[0].text
2073 duration = float(last_type.findall('./duration')[0].text)
# The extension is whatever follows the last '.' in the file name.
2075 video_url = 'http://video2.spiegel.de/flash/' + filename
2076 video_ext = filename.rpartition('.')[2]
2081 'title': video_title,
2082 'duration': duration,
# LiveLeakIE: extractor for liveleak.com view pages.
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise`, parts of the result dict and the return are not visible here.
# Confirm against the full file before editing code.
2086 class LiveLeakIE(InfoExtractor):
# NOTE(review): `http?` makes only the final 'p' optional, so an "https://"
# prefix is not matched by this anchored pattern; presumably `https?` was
# intended -- confirm.
# `video_id` is the value of the `i` query parameter.
2088 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2089 IE_NAME = u'liveleak'
# Downloads the view page and scrapes stream URL, title, description and
# uploader.
2091 def _real_extract(self, url):
2092 mobj = re.match(self._VALID_URL, url)
2094 raise ExtractorError(u'Invalid URL: %s' % url)
2096 video_id = mobj.group('video_id')
2098 webpage = self._download_webpage(url, video_id)
# The stream URL is the quoted value after `file: ` in the page.
2100 video_url = self._search_regex(r'file: "(.*?)",',
2101 webpage, u'video URL')
# The 'LiveLeak.com -' prefix is removed from the og:title value.
2103 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2104 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader lookups are non-fatal (fatal=False).
2106 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2107 webpage, u'description', fatal=False)
2109 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2110 webpage, u'uploader', fatal=False)
2116 'title': video_title,
2117 'description': video_description,
2118 'uploader': video_uploader
# TumblrIE: extractor for video posts on <blog>.tumblr.com.
# NOTE(review): this view of the class has gaps -- the guard ahead of the
# `raise` and parts of the returned dict are not visible here. Confirm against
# the full file before editing code.
2125 class TumblrIE(InfoExtractor):
# `blog_name` is the subdomain; `id` is the numeric segment after /post/ or
# /video/ (a '/' after the id is required by the pattern).
2126 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2128 def _real_extract(self, url):
# No None-check on the match is visible here.
2129 m_url = re.match(self._VALID_URL, url)
2130 video_id = m_url.group('id')
2131 blog = m_url.group('blog_name')
# The URL is normalised to the /post/<id>/ form before downloading.
2133 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2134 webpage = self._download_webpage(url, video_id)
# The player markup is embedded with quotes written as the literal text \x22,
# so the pattern matches that escape sequence instead of '"'. The blog name
# and id are interpolated into the pattern without re.escape.
2136 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2137 video = re.search(re_video, webpage)
2139 raise ExtractorError(u'Unable to extract video')
2140 video_url = video.group('video_url')
2141 ext = video.group('ext')
# The thumbnail lookup is non-fatal; backslashes are removed from the value
# when one is found.
2143 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2144 webpage, u'thumbnail', fatal=False) # We pick the first poster
2145 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2147 # The only place where you can get a title, it's not complete,
2148 # but searching in other places doesn't work for all videos
2149 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2150 webpage, u'title', flags=re.DOTALL)
2152 return [{'id': video_id,
2154 'title': video_title,
2155 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the track.

        Raises ExtractorError when the URL is invalid, when the track has no
        free download page, or when any of the intermediate pages does not
        contain the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url format')

        # We build the url we will use to get the final track url.
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract the final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both fields are looked up without fatal=False.
        stream_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
        }]
class InaIE(InfoExtractor):
    """Information extractor for ina.fr video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        The metadata is read from the player's .mrss document for the video
        id rather than from the page at *url*.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        feed_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        feed = self._download_webpage(feed_url, video_id)

        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        # The title sits inside a CDATA section of the feed.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
        }]
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        The page is always requested from www.howcast.com using the numeric
        id taken from *url*.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        # Required fields: looked up without fatal=False.
        stream_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Optional fields: looked up with fatal=False.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information extractor for vine.co video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        The page is always requested over https from vine.co using the id
        taken from *url*.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # Required fields: looked up without fatal=False.
        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Optional fields: looked up with fatal=False.
        # The second, optional group covers a trailing query string.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information extractor for videos hosted on flickr.com."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        Three documents are fetched in turn: the photo page (for the secret
        and the metadata), a first XML document (for the node id) and a
        playlist XML document (for the stream location).
        Raises ExtractorError when the playlist contains no stream entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = ''.join(
            ['http://www.flickr.com/photos/', uploader_id, '/', video_id])
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id, '&secret=', secret, '&bitrate=700&target=_self'])
        first_xml = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=', node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=', secret,
            '&rd=video.yahoo.com&noad=1'])
        second_xml = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The stream URL is the APP value followed by the unescaped FULLPATH.
        app, full_path = stream.group(1), stream.group(2)
        stream_url = app + unescapeHTML(full_path)

        # The title is required; description and thumbnail use fatal=False.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        The numeric video id is read from the page and then used to fetch
        the XML document that holds the stream URL.
        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')

        # The numeric id is not known yet, so the URL slug is passed instead.
        page = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Optional fields: looked up with fatal=False.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(
            data_url, video_id, 'Downloading data webpage')

        # Only the "high" file entry of the data document is used.
        stream_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        Raises ExtractorError when the media location cannot be found in the
        page. A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: the 'file' value is itself a quoted URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the track.

        Raises ExtractorError when the URL is invalid, when either response
        is not valid JSON, or when the track list holds no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the second request.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # Valid JSON can still lack the 'tracks' key or hold an empty list;
        # report that as an extraction failure rather than a bare lookup error.
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine returned no track data.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video.

        Raises ExtractorError when the URL is invalid or when the info
        response does not consist of exactly two ``key=value`` fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location from script; append that
        # location to the URL actually reached to get the real page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split each field on the first '=' only, so a value that itself
        # contains '=' (e.g. a URL with a query string) is kept whole.
        fields = [field.split('=', 1) for field in info_response.split('&')]
        if len(fields) != 2 or any(len(pair) != 2 for pair in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = fields[0][1], fields[1][1]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2555 def gen_extractors():
2556 """ Return a list of an instance of every supported extractor.
2557 The order does matter; the first extractor matched is the one handling the URL.
2560 YoutubePlaylistIE(),
2585 StanfordOpenClassroomIE(),
2595 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    The class is looked up in this module's globals under the name
    ``ie_name + 'IE'``; an unknown name raises KeyError.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]