10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.dailymotion import DailymotionIE
25 from .extractor.gametrailers import GametrailersIE
26 from .extractor.generic import GenericIE
27 from .extractor.metacafe import MetacafeIE
28 from .extractor.myvideo import MyVideoIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.vimeo import VimeoIE
32 from .extractor.yahoo import YahooIE, YahooSearchIE
33 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
34 from .extractor.zdf import ZDFIE
# NOTE(review): legacy regex-scraping extractor. This excerpt is line-sampled
# (gaps in the embedded numbering) — e.g. the `try:` opener before line 68 and
# the `return`/dict wrapper around lines 91-96 are not visible here.
54 class DepositFilesIE(InfoExtractor):
55     """Information extractor for depositfiles.com"""
57 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
59 def _real_extract(self, url):
# Last path segment is treated as the file id.
60 file_id = url.split('/')[-1]
61 # Rebuild url in english locale
62 url = 'http://depositfiles.com/en/files/' + file_id
64 # Retrieve file webpage with 'Free download' button pressed
65 free_download_indication = { 'gateway_result' : '1' }
# POST body simulates pressing the "Free download" button.
66 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
68 self.report_download_webpage(file_id)
69 webpage = compat_urllib_request.urlopen(request).read()
70 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
71 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
73 # Search for the real file URL
74 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
75 if (mobj is None) or (mobj.group(1) is None):
76 # Try to figure out reason of the error.
77 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
78 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site-provided restriction message.
79 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
80 raise ExtractorError(u'%s' % restriction_message)
82 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
84 file_url = mobj.group(1)
# Extension without the leading dot.
85 file_extension = os.path.splitext(file_url)[1][1:]
87 # Search for file title
88 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') on these values implies Python 2 byte
# strings — confirm the target runtime before reusing this code.
91 'id': file_id.decode('utf-8'),
92 'url': file_url.decode('utf-8'),
96 'ext': file_extension.decode('utf-8'),
# NOTE(review): excerpt is line-sampled — several statements (try openers,
# else branches, the login_form construction near line 145, the return dict
# wrapper) are missing from view. Code left byte-identical.
100 class FacebookIE(InfoExtractor):
101 """Information Extractor for Facebook"""
103 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
104 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
105 _NETRC_MACHINE = 'facebook'
106 IE_NAME = u'facebook'
108 def report_login(self):
109 """Report attempt to log in."""
110 self.to_screen(u'Logging in')
# Optional login step: credentials come from CLI options or ~/.netrc.
112 def _real_initialize(self):
113 if self._downloader is None:
118 downloader_params = self._downloader.params
120 # Attempt to use provided username and password or .netrc data
121 if downloader_params.get('username', None) is not None:
122 useremail = downloader_params['username']
123 password = downloader_params['password']
124 elif downloader_params.get('usenetrc', False):
126 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
131 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
132 except (IOError, netrc.NetrcParseError) as err:
# Login is best-effort: failures are warnings, not fatal errors.
133 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
136 if useremail is None:
145 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
148 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
149 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
150 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
152 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
153 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
156 def _real_extract(self, url):
157 mobj = re.match(self._VALID_URL, url)
159 raise ExtractorError(u'Invalid URL: %s' % url)
160 video_id = mobj.group('ID')
162 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
163 webpage = self._download_webpage(url, video_id)
# The video parameters are embedded in inline JS between these two markers.
165 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
166 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
167 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
169 raise ExtractorError(u'Cannot parse data')
170 data = dict(json.loads(m.group(1)))
171 params_raw = compat_urllib_parse.unquote(data['params'])
172 params = json.loads(params_raw)
173 video_data = params['video_data'][0]
# Prefer HD, fall back to SD (fallback branch partially out of view).
174 video_url = video_data.get('hd_src')
176 video_url = video_data['sd_src']
178 raise ExtractorError(u'Cannot find video URL')
179 video_duration = int(video_data['video_duration'])
180 thumbnail = video_data['thumbnail_src']
182 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
187 'title': video_title,
190 'duration': video_duration,
191 'thumbnail': thumbnail,
# NOTE(review): excerpt is line-sampled — the _video_extensions /
# _video_dimensions dict bodies, several if/else openers, the turls
# accumulator, and the results/return plumbing are missing from view.
199 class ComedyCentralIE(InfoExtractor):
200 """Information extractor for The Daily Show and Colbert Report """
202 # urls can be abbreviations like :thedailyshow or :colbert
203 # urls for episodes like:
204 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
205 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
206 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
207 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
208 |(https?://)?(www\.)?
209 (?P<showname>thedailyshow|colbertnation)\.com/
210 (full-episodes/(?P<episode>.*)|
212 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
213 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates, lowest to highest.
216 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
218 _video_extensions = {
226 _video_dimensions = {
# _VALID_URL is a verbose-mode regex, so the default suitable() (which
# matches without re.VERBOSE) is overridden here.
236 def suitable(cls, url):
237 """Receives a URL and returns True if suitable for this IE."""
238 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
240 def _print_formats(self, formats):
241 print('Available formats:')
243 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
246 def _real_extract(self, url):
247 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
249 raise ExtractorError(u'Invalid URL: %s' % url)
# ":tds" / ":colbert" style shortcuts are expanded to full-episode URLs.
251 if mobj.group('shortname'):
252 if mobj.group('shortname') in ('tds', 'thedailyshow'):
253 url = u'http://www.thedailyshow.com/full-episodes/'
255 url = u'http://www.colbertnation.com/full-episodes/'
256 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
257 assert mobj is not None
259 if mobj.group('clip'):
260 if mobj.group('showname') == 'thedailyshow':
261 epTitle = mobj.group('tdstitle')
263 epTitle = mobj.group('cntitle')
266 dlNewest = not mobj.group('episode')
268 epTitle = mobj.group('showname')
270 epTitle = mobj.group('episode')
272 self.report_extraction(epTitle)
273 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the server redirect to a specific episode page.
275 url = htmlHandle.geturl()
276 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
278 raise ExtractorError(u'Invalid redirected URL: ' + url)
279 if mobj.group('episode') == '':
280 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
281 epTitle = mobj.group('episode')
283 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
285 if len(mMovieParams) == 0:
286 # The Colbert Report embeds the information in a without
287 # a URL prefix; so extract the alternate reference
288 # and then add the URL prefix manually.
290 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
291 if len(altMovieParams) == 0:
292 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
294 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
296 uri = mMovieParams[0][1]
297 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
298 indexXml = self._download_webpage(indexUrl, epTitle,
299 u'Downloading show index',
300 u'unable to download episode index')
# One RSS <item> per episode part; each part is downloaded separately.
304 idoc = xml.etree.ElementTree.fromstring(indexXml)
305 itemEls = idoc.findall('.//item')
306 for partNum,itemEl in enumerate(itemEls):
307 mediaId = itemEl.findall('./guid')[0].text
308 shortMediaId = mediaId.split(':')[-1]
309 showId = mediaId.split(':')[-2].replace('.com', '')
310 officialTitle = itemEl.findall('./title')[0].text
311 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
313 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
314 compat_urllib_parse.urlencode({'uri': mediaId}))
315 configXml = self._download_webpage(configUrl, epTitle,
316 u'Downloading configuration for %s' % shortMediaId)
318 cdoc = xml.etree.ElementTree.fromstring(configXml)
320 for rendition in cdoc.findall('.//rendition'):
# (bitrate, rtmp-url) pairs; accumulation list is out of view.
321 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
325 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
328 if self._downloader.params.get('listformats', None):
329 self._print_formats([i[0] for i in turls])
332 # For now, just pick the highest bitrate
333 format,rtmp_video_url = turls[-1]
335 # Get the format arg from the arg stream
336 req_format = self._downloader.params.get('format', None)
338 # Select format if we can find one
341 format, rtmp_video_url = f, v
# Rewrite the RTMP URL into a plain HTTP mp4 URL on the CDN.
344 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
346 raise ExtractorError(u'Cannot transform RTMP url')
347 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
348 video_url = base + m.group('finalid')
350 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
355 'upload_date': officialDate,
360 'description': officialTitle,
# NOTE(review): excerpt is line-sampled; the return-dict wrapper and some
# intermediate lines are missing from view. Code left byte-identical.
367 class EscapistIE(InfoExtractor):
368 """Information extractor for The Escapist """
370 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
371 IE_NAME = u'escapist'
373 def _real_extract(self, url):
374 mobj = re.match(self._VALID_URL, url)
376 raise ExtractorError(u'Invalid URL: %s' % url)
377 showName = mobj.group('showname')
378 videoId = mobj.group('episode')
380 self.report_extraction(videoId)
381 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; all optional except the player URL.
383 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
384 webpage, u'description', fatal=False)
386 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
387 webpage, u'thumbnail', fatal=False)
389 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
390 webpage, u'player url')
# NOTE(review): the label u'player url' below looks like a copy-paste slip —
# this regex extracts the title (it keeps the last ' : '-separated segment).
392 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
393 webpage, u'player url').split(' : ')[-1]
# The player URL carries a percent-encoded config URL in its query string.
395 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
396 configUrl = compat_urllib_parse.unquote(configUrl)
398 configJSON = self._download_webpage(configUrl, videoId,
399 u'Downloading configuration',
400 u'unable to download configuration')
402 # Technically, it's JavaScript, not JSON
# Crude single-to-double-quote fix-up so json.loads can parse it.
403 configJSON = configJSON.replace("'", '"')
406 config = json.loads(configJSON)
407 except (ValueError,) as err:
408 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
410 playlist = config['playlist']
# Index 1 is assumed to be the actual video entry — TODO confirm.
411 videoUrl = playlist[1]['url']
416 'uploader': showName,
421 'description': videoDesc,
422 'player_url': playerUrl,
# NOTE(review): excerpt is line-sampled — the info-dict initialization,
# try openers, and the return statement are missing from view.
427 class CollegeHumorIE(InfoExtractor):
428 """Information extractor for collegehumor.com"""
431 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
432 IE_NAME = u'collegehumor'
434 def report_manifest(self, video_id):
435 """Report information extraction."""
436 self.to_screen(u'%s: Downloading XML manifest' % video_id)
438 def _real_extract(self, url):
439 mobj = re.match(self._VALID_URL, url)
441 raise ExtractorError(u'Invalid URL: %s' % url)
442 video_id = mobj.group('videoid')
450 self.report_extraction(video_id)
# Step 1: per-video metadata XML.
451 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
453 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
455 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
457 mdoc = xml.etree.ElementTree.fromstring(metaXml)
459 videoNode = mdoc.findall('./video')[0]
460 info['description'] = videoNode.findall('./description')[0].text
461 info['title'] = videoNode.findall('./caption')[0].text
462 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
463 manifest_url = videoNode.findall('./file')[0].text
465 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: Adobe HDS (f4m) manifest; hdcore param required by the server.
467 manifest_url += '?hdcore=2.10.3'
468 self.report_manifest(video_id)
470 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
472 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
474 adoc = xml.etree.ElementTree.fromstring(manifestXml)
476 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
477 node_id = media_node.attrib['url']
478 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
479 except IndexError as err:
480 raise ExtractorError(u'Invalid manifest file')
# Rebuild the first-fragment URL from the manifest's own location.
482 url_pr = compat_urllib_parse_urlparse(manifest_url)
483 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
490 class XVideosIE(InfoExtractor):
491 """Information extractor for xvideos.com"""
493 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
496 def _real_extract(self, url):
497 mobj = re.match(self._VALID_URL, url)
499 raise ExtractorError(u'Invalid URL: %s' % url)
500 video_id = mobj.group(1)
502 webpage = self._download_webpage(url, video_id)
504 self.report_extraction(video_id)
# Video URL is percent-encoded in a flv_url= page parameter.
507 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
508 webpage, u'video URL'))
510 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
514 # Extract video thumbnail
515 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
516 webpage, u'thumbnail', fatal=False)
523 'title': video_title,
525 'thumbnail': video_thumbnail,
# NOTE(review): excerpt is line-sampled — the docstring terminator and the
# return-dict wrapper are out of view. Code left byte-identical.
532 class SoundcloudIE(InfoExtractor):
533 """Information extractor for soundcloud.com
534        To access the media, the uid of the song and a stream token
535        must be extracted from the page source and the script must make
536        a request to media.soundcloud.com/crossdomain.xml. Then
537        the media can be grabbed by requesting from an url composed
538        of the stream token and uid
541 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
542 IE_NAME = u'soundcloud'
544 def report_resolve(self, video_id):
545 """Report information extraction."""
546 self.to_screen(u'%s: Resolving id' % video_id)
548 def _real_extract(self, url):
549 mobj = re.match(self._VALID_URL, url)
551 raise ExtractorError(u'Invalid URL: %s' % url)
553 # extract uploader (which is in the url)
554 uploader = mobj.group(1)
555 # extract simple title (uploader + slug of song title)
556 slug_title = mobj.group(2)
557 simple_title = uploader + u'-' + slug_title
558 full_title = '%s/%s' % (uploader, slug_title)
560 self.report_resolve(full_title)
# Resolve the vanity URL to a numeric track id via the public API.
562 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
563 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
564 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
566 info = json.loads(info_json)
567 video_id = info['id']
568 self.report_extraction(full_title)
# Second API call returns per-format stream URLs for the track.
570 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
571 stream_json = self._download_webpage(streams_url, full_title,
572 u'Downloading stream definitions',
573 u'unable to download stream definitions')
575 streams = json.loads(stream_json)
576 mediaURL = streams['http_mp3_128_url']
577 upload_date = unified_strdate(info['created_at'])
582 'uploader': info['user']['username'],
583 'upload_date': upload_date,
584 'title': info['title'],
586 'description': info['description'],
# NOTE(review): set (playlist) variant of SoundcloudIE; excerpt is
# line-sampled — the docstring terminator, try openers, the per-track
# results accumulation and the return are out of view.
589 class SoundcloudSetIE(InfoExtractor):
590 """Information extractor for soundcloud.com sets
591        To access the media, the uid of the song and a stream token
592        must be extracted from the page source and the script must make
593        a request to media.soundcloud.com/crossdomain.xml. Then
594        the media can be grabbed by requesting from an url composed
595        of the stream token and uid
598 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
599 IE_NAME = u'soundcloud:set'
601 def report_resolve(self, video_id):
602 """Report information extraction."""
603 self.to_screen(u'%s: Resolving id' % video_id)
605 def _real_extract(self, url):
606 mobj = re.match(self._VALID_URL, url)
608 raise ExtractorError(u'Invalid URL: %s' % url)
610 # extract uploader (which is in the url)
611 uploader = mobj.group(1)
612 # extract simple title (uploader + slug of song title)
613 slug_title = mobj.group(2)
614 simple_title = uploader + u'-' + slug_title
615 full_title = '%s/sets/%s' % (uploader, slug_title)
617 self.report_resolve(full_title)
# Resolve the set URL to its JSON description (list of tracks).
619 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
620 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
621 info_json = self._download_webpage(resolv_url, full_title)
624 info = json.loads(info_json)
# API-level errors are reported per entry in an 'errors' list.
626 for err in info['errors']:
627 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
630 self.report_extraction(full_title)
631 for track in info['tracks']:
632 video_id = track['id']
634 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
635 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
637 self.report_extraction(video_id)
638 streams = json.loads(stream_json)
639 mediaURL = streams['http_mp3_128_url']
644 'uploader': track['user']['username'],
645 'upload_date': unified_strdate(track['created_at']),
646 'title': track['title'],
648 'description': track['description'],
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
653 class InfoQIE(InfoExtractor):
654 """Information extractor for infoq.com"""
655 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
657 def _real_extract(self, url):
658 mobj = re.match(self._VALID_URL, url)
660 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL pattern, so the URL itself is used as the id.
662 webpage = self._download_webpage(url, video_id=url)
663 self.report_extraction(url)
666 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
668 raise ExtractorError(u'Unable to extract video url')
# The real media path is base64- and percent-encoded in the page JS.
669 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
670 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
673 video_title = self._search_regex(r'contentTitle = "(.*?)";',
676 # Extract description
677 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
678 webpage, u'description', fatal=False)
# Derive id and extension from the media file name.
680 video_filename = video_url.split('/')[-1]
681 video_id, extension = video_filename.split('.')
688 'title': video_title,
689 'ext': extension, # Extension is always(?) mp4, but seems to be flv
691 'description': video_description,
# NOTE(review): extractor is disabled (_WORKING = False). Excerpt is
# line-sampled — try openers, returns, and break statements are out of view.
696 class MixcloudIE(InfoExtractor):
697 """Information extractor for www.mixcloud.com"""
699 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
700 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
701 IE_NAME = u'mixcloud'
703 def report_download_json(self, file_id):
704 """Report JSON download."""
705 self.to_screen(u'Downloading json')
707 def get_urls(self, jsonData, fmt, bitrate='best'):
708 """Get urls from 'audio_formats' section in json"""
711 bitrate_list = jsonData[fmt]
# 'best' or an unknown bitrate falls back to the highest available one.
712 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
713 bitrate = max(bitrate_list) # select highest
715 url_list = jsonData[fmt][bitrate]
716 except TypeError: # we have no bitrate info.
717 url_list = jsonData[fmt]
720 def check_urls(self, url_list):
721 """Returns 1st active url from list"""
# Probes each candidate URL and keeps the first that opens successfully.
724 compat_urllib_request.urlopen(url)
726 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
731 def _print_formats(self, formats):
732 print('Available formats:')
733 for fmt in formats.keys():
734 for b in formats[fmt]:
736 ext = formats[fmt][b][0]
737 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
738 except TypeError: # we have no bitrate info
739 ext = formats[fmt][0]
740 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
743 def _real_extract(self, url):
744 mobj = re.match(self._VALID_URL, url)
746 raise ExtractorError(u'Invalid URL: %s' % url)
747 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups implies Python 2 bytes.
748 uploader = mobj.group(1).decode('utf-8')
749 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
751 # construct API request
752 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
753 # retrieve .json file with links to files
754 request = compat_urllib_request.Request(file_url)
756 self.report_download_json(file_url)
757 jsonData = compat_urllib_request.urlopen(request).read()
758 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
759 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
762 json_data = json.loads(jsonData)
763 player_url = json_data['player_swf_url']
764 formats = dict(json_data['audio_formats'])
766 req_format = self._downloader.params.get('format', None)
769 if self._downloader.params.get('listformats', None):
770 self._print_formats(formats)
# Default: scan all formats and take the first with a working URL.
773 if req_format is None or req_format == 'best':
774 for format_param in formats.keys():
775 url_list = self.get_urls(formats, format_param)
777 file_url = self.check_urls(url_list)
778 if file_url is not None:
781 if req_format not in formats:
782 raise ExtractorError(u'Format is not available')
784 url_list = self.get_urls(formats, req_format)
785 file_url = self.check_urls(url_list)
786 format_param = req_format
789 'id': file_id.decode('utf-8'),
790 'url': file_url.decode('utf-8'),
791 'uploader': uploader.decode('utf-8'),
793 'title': json_data['name'],
794 'ext': file_url.split('.')[-1].decode('utf-8'),
795 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
796 'thumbnail': json_data['thumbnail_url'],
797 'description': json_data['description'],
798 'player_url': player_url.decode('utf-8'),
# NOTE(review): three-way dispatch (video page / course page / root page);
# excerpt is line-sampled — info-dict initializers, list-building loops and
# return statements are out of view. Code left byte-identical.
801 class StanfordOpenClassroomIE(InfoExtractor):
802 """Information extractor for Stanford's Open ClassRoom"""
804 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
805 IE_NAME = u'stanfordoc'
807 def _real_extract(self, url):
808 mobj = re.match(self._VALID_URL, url)
810 raise ExtractorError(u'Invalid URL: %s' % url)
812 if mobj.group('course') and mobj.group('video'): # A specific video
813 course = mobj.group('course')
814 video = mobj.group('video')
816 'id': course + '_' + video,
821 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course videos.
822 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
823 xmlUrl = baseUrl + video + '.xml'
825 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
826 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
827 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
828 mdoc = xml.etree.ElementTree.fromstring(metaXml)
830 info['title'] = mdoc.findall('./title')[0].text
831 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
833 raise ExtractorError(u'Invalid metadata XML file')
834 info['ext'] = info['url'].rpartition('.')[2]
836 elif mobj.group('course'): # A course page
837 course = mobj.group('course')
845 coursepage = self._download_webpage(url, info['id'],
846 note='Downloading course info page',
847 errnote='Unable to download course info page')
849 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
851 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
852 coursepage, u'description', fatal=False)
# Collect links to every video page of the course, de-duplicated in order.
854 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
858 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
862 for entry in info['list']:
863 assert entry['type'] == 'reference'
864 results += self.extract(entry['url'])
868 'id': 'Stanford OpenClassroom',
874 self.report_download_webpage(info['id'])
875 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
877 rootpage = compat_urllib_request.urlopen(rootURL).read()
878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
879 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
881 info['title'] = info['id']
883 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
887 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
892 for entry in info['list']:
893 assert entry['type'] == 'reference'
894 results += self.extract(entry['url'])
# NOTE(review): excerpt is line-sampled — the performer extraction (used at
# line 949) and the return-dict wrapper are out of view.
897 class MTVIE(InfoExtractor):
898 """Information extractor for MTV.com"""
900 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
903 def _real_extract(self, url):
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL, so normalize to http:// here.
907 if not mobj.group('proto'):
908 url = 'http://' + url
909 video_id = mobj.group('videoid')
911 webpage = self._download_webpage(url, video_id)
913 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
914 webpage, u'song name', fatal=False)
916 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
919 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
920 webpage, u'mtvn_uri', fatal=False)
922 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
923 webpage, u'content id', fatal=False)
# mediaGen feed resolves the page metadata into actual stream renditions.
925 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
926 self.report_extraction(video_id)
927 request = compat_urllib_request.Request(videogen_url)
929 metadataXml = compat_urllib_request.urlopen(request).read()
930 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
931 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
933 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
934 renditions = mdoc.findall('.//rendition')
936 # For now, always pick the highest quality.
937 rendition = renditions[-1]
# Format label: ext-WxH_bitrate, derived from the rendition attributes.
940 _,_,ext = rendition.attrib['type'].partition('/')
941 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
942 video_url = rendition.find('./src').text
944 raise ExtractorError('Invalid rendition field.')
949 'uploader': performer,
951 'title': video_title,
# NOTE(review): excerpt is line-sampled — the _gen_sid def line, several
# format-selection branches, try openers and the final return are out of
# view. Code left byte-identical.
959 class YoukuIE(InfoExtractor):
960 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
963 nowTime = int(time.time() * 1000)
964 random1 = random.randint(1000,1998)
965 random2 = random.randint(1000,9999)
967 return "%d%d%d" %(nowTime,random1,random2)
969 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of this alphabet, keyed by the
# server-provided seed; used to decode the obfuscated file id.
971 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
973 for i in range(len(source)):
974 seed = (seed * 211 + 30031 ) % 65536
975 index = math.floor(seed / 65536 * len(source) )
976 mixed.append(source[int(index)])
977 source.remove(source[int(index)])
978 #return ''.join(mixed)
981 def _get_file_id(self, fileId, seed):
# Each '*'-separated token is an index into the mixed alphabet.
982 mixed = self._get_file_ID_mix_string(seed)
983 ids = fileId.split('*')
987 realId.append(mixed[int(ch)])
988 return ''.join(realId)
990 def _real_extract(self, url):
991 mobj = re.match(self._VALID_URL, url)
993 raise ExtractorError(u'Invalid URL: %s' % url)
994 video_id = mobj.group('ID')
996 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
998 jsondata = self._download_webpage(info_url, video_id)
1000 self.report_extraction(video_id)
1002 config = json.loads(jsondata)
1004 video_title = config['data'][0]['title']
1005 seed = config['data'][0]['seed']
1007 format = self._downloader.params.get('format', None)
1008 supported_format = list(config['data'][0]['streamfileids'].keys())
# Quality selection: prefer hd2 for 'best' (other branches out of view).
1010 if format is None or format == 'best':
1011 if 'hd2' in supported_format:
1016 elif format == 'worst':
1024 fileid = config['data'][0]['streamfileids'][format]
1025 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1026 except (UnicodeDecodeError, ValueError, KeyError):
1027 raise ExtractorError(u'Unable to extract info section')
1030 sid = self._gen_sid()
1031 fileid = self._get_file_id(fileid, seed)
1033 #column 8,9 of fileid represent the segment number
1034 #fileid[7:9] should be changed
# One download URL per segment; segment number is hex-encoded into the id.
1035 for index, key in enumerate(keys):
1037 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1038 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1041 'id': '%s_part%02d' % (video_id, index),
1042 'url': download_url,
1044 'upload_date': None,
1045 'title': video_title,
1048 files_info.append(info)
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
1053 class XNXXIE(InfoExtractor):
1054 """Information extractor for xnxx.com"""
1056 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns kept as class constants.
1058 VIDEO_URL_RE = r'flv_url=(.*?)&'
1059 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1060 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1062 def _real_extract(self, url):
1063 mobj = re.match(self._VALID_URL, url)
1065 raise ExtractorError(u'Invalid URL: %s' % url)
1066 video_id = mobj.group(1)
1068 # Get webpage content
1069 webpage = self._download_webpage(url, video_id)
# flv_url parameter is percent-encoded in the page.
1071 video_url = self._search_regex(self.VIDEO_URL_RE,
1072 webpage, u'video URL')
1073 video_url = compat_urllib_parse.unquote(video_url)
1075 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1078 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1079 webpage, u'thumbnail', fatal=False)
1085 'upload_date': None,
1086 'title': video_title,
1088 'thumbnail': video_thumbnail,
1089 'description': None,
# NOTE(review): excerpt is line-sampled — the empty-findall check near line
# 1142, the try opener before line 1153 and the return-dict wrapper are out
# of view. Code left byte-identical.
1093 class GooglePlusIE(InfoExtractor):
1094 """Information extractor for plus.google.com."""
1096 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1097 IE_NAME = u'plus.google'
1099 def _real_extract(self, url):
1100 # Extract id from URL
1101 mobj = re.match(self._VALID_URL, url)
1103 raise ExtractorError(u'Invalid URL: %s' % url)
1105 post_url = mobj.group(0)
1106 video_id = mobj.group(1)
1108 video_extension = 'flv'
1110 # Step 1, Retrieve post webpage to extract further information
1111 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1113 self.report_extraction(video_id)
1115 # Extract update date
1116 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1117 webpage, u'upload date', fatal=False)
1119 # Convert timestring to a format suitable for filename
# NOTE(review): strptime will raise if upload_date is None (fatal=False
# above can return None) — guard is not visible in this excerpt.
1120 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1121 upload_date = upload_date.strftime('%Y%m%d')
1124 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1125 webpage, u'uploader', fatal=False)
1128 # Get the first line for title
1129 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1130 webpage, 'title', default=u'NA')
1132 # Step 2, Stimulate clicking the image box to launch video
1133 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1134 webpage, u'video page URL')
1135 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1137 # Extract video links on video page
1138 """Extract video links of all sizes"""
1139 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1140 mobj = re.findall(pattern, webpage)
1142 raise ExtractorError(u'Unable to extract video links')
1144 # Sort in resolution
1145 links = sorted(mobj)
1147 # Choose the lowest of the sort, i.e. highest resolution
1148 video_url = links[-1]
1149 # Only get the url. The resolution part in the tuple has no use anymore
1150 video_url = video_url[-1]
1151 # Treat escaped \u0026 style hex
1153 video_url = video_url.decode("unicode_escape")
1154 except AttributeError: # Python 3
1155 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1161 'uploader': uploader,
1162 'upload_date': upload_date,
1163 'title': video_title,
1164 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is built directly from the page path; no page scraping needed for it.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Last path component doubles as the short id and the title fallback.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'id': shortened_video_id,
        # 'uploader_date': uploader_date,
        'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a whole channel, a single broadcast (/b/), or a chapter (/c/).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # Maximum number of videos the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert each clip into an info dict."""
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a dict instead of a list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-like; strip dashes from the date part for YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # Fields of the per-clip info dict (enclosing literal partly elided here).
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        # Pick the API endpoint depending on which named group matched.
        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The archive id is only available in the chapter's HTML page.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element whose id matches the page's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            # Fields of the chapter info dict (enclosing literal partly elided here).
            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
            # Fallback branch: a single broadcast id (/b/ URL).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API, _JUSTIN_PAGE_LIMIT clips at a time, until a short page.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The real stream URL sits in the second <source> of the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first, then fall back to the <title> tag.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        # Field of the returned info dict (enclosing literal partly elided here).
        'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Age-gate bypass: submits a fixed 1970 birth date via query parameters.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose (?x-style) mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of all trailers found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # If the store interposes an age check, re-request through the bypass URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        # Three parallel scans over the page: movie entries, titles, thumbnails.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        # zip() pairs the three scans positionally; assumes they stay in sync — TODO confirm.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            # Missing-URL guard (condition line elided in this view).
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
            # Fields of the per-video info dict (enclosing literal partly elided here).
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The stream URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
        # Uploader and thumbnail are optional (fatal=False).
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'uploader': uploader,
        'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The flash player receives the file URL via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        # Candy pages carry the title in a different element; prefer it when present.
        _title = r"""candytitles.*>(.*)</span>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            video_title = mobj.group(1)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title' : video_title,
        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JS assignment: window.gon ... gon.show={...};
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force 256 kbps delivery on the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the entry whose 'format' field equals req_format (loop header elided here).
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Pretend the age gate has already been passed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        params = json.loads(json_params)
        raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)

        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
        raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Path segment 4 looks like "480p_370k_<id>"; keep resolution and bitrate.
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        # title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Fields of each format's info dict (enclosing literal partly elided here).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Map the requested format spec onto the (best-first) formats list.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
            # Missing-format guard (condition line elided in this view).
            raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL appears percent-encoded in the player config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        # Normalize to YYYYMMDD only when a date was actually found.
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                # Remaining fields (some lines of this literal elided in this view).
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        # Missing-embed guard (condition line elided in this view).
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The numeric embed id supersedes the slug-based id from the URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed player passes the file URL through encodeURIComponent.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                # Remaining fields (some lines of this literal elided in this view).
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment: PAGE.mix = {...};
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Random session id required by the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is defined on a line not shown here — presumably data['id'].
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the play/next API until the service reports the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # Fields of the per-track info dict (enclosing literal partly elided here).
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both media URLs are derived directly from the video id on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
        # Uploader is optional (fatal=False).
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL shape: a single talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Body of the video_RE verbose regex (its opening line is elided in this view).
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Each playlist entry is deferred to this same IE via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        # Talk metadata is embedded as a JS object literal: var talkDetails = {...}
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        # Fields of the returned info dict (enclosing literal partly elided here);
        # htmlStreams is ordered so the last entry is the one picked.
        'url': info['htmlStreams'][-1]['file'],
        'thumbnail': thumbnail,
        'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de; metadata comes from an XML API."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the metadata XML document for this video id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        format = format_id_el.text
        # description and thumbnail are optional elements in the XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # Fields of the returned info dict (enclosing literal partly elided here).
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (metadata via flash XML)."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
        # Per-video XML descriptor listing the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element is the best/preferred encoding.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Player config holds the direct file URL.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'description': video_description,
        'uploader': video_uploader
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL appears inside escaped JS (\x22 quotes) in the page.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # Missing-video guard (condition line elided in this view).
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        # Unescape the backslash-escaped thumbnail URL when one was found.
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 # Remaining fields (some lines of this literal elided in this view).
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for bandcamp.com free track downloads."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id is read out of the embedded TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      # Remaining fields (some lines of this literal elided in this view).
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Direct mp4 URL from the HTML5 <source> tag.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
        # Fields of the returned info dict (enclosing literal partly elided here).
        'ext': video_extension,
        'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata is served as an MRSS feed keyed on the video id.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # Title is wrapped in a CDATA section inside the feed.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
        # Fields of the returned info dict (enclosing literal partly elided here).
        'ext': video_extension,
        'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Rebuild a canonical page URL from the id before downloading.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Meta tags may use either double or single quotes; both alternations are tried.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)
        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'description': video_description,
        'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Always fetch over HTTPS regardless of the scheme the user gave.
        page = self._download_webpage('https://vine.co/v/%s' % video_id, video_id)
        self.report_extraction(video_id)

        # Stream URL and metadata are exposed through twitter/og meta tags.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        # The photo page carries the per-video "secret" needed by the API.
        page_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(page_url, video_id)
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # First API round-trip: resolve the internal node id for the video.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self' % (video_id, secret)
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Second round-trip: the playlist XML with the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1' % (node_id, secret)
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped inside the XML attribute.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # og: meta tags may use either quote style.
        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')

        # The URL only carries a slug; the numeric id is on the page itself.
        webpage = self._download_webpage(url, url_title)
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Stream URL comes from a separate per-video XML document.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # BUGFIX: "www." previously used an unescaped dot, so e.g. "wwwX" also
    # matched; escape it to match only the literal "www." prefix.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL, title, upload date, uploader and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The inline player config holds a 'srv'/'file' pair; an empty server
        # means 'file' is already a full (percent-encoded) URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only present as a tooltip hint (YYYY-MM-DD hh:mm:ss TZ).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Append ax/ts query parameters and keep the session cookie the
        # server hands back -- the /serve endpoint below requires it.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds its track list as JSON in a <script> island.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request resolves the actual media URL for (id, key).
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page is a JS redirect stub; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title looks like "Title / extra" -- keep only the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is a flat k=v&k=v string
        # carrying the media URL and the thumbnail URL.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): this span is a garbled/elided paste -- literal line numbers
# are baked into the text, the interior of the extractor list (dozens of
# entries between the visible ones) and the closing bracket/return are
# missing. Left byte-identical; restore from the original file before use.
# Purpose (from the visible docstring): build the ordered list of extractor
# instances -- order matters, the first extractor whose URL pattern matches
# handles the download.
2407 def gen_extractors():
2408     """ Return a list of an instance of every supported extractor.
2409     The order does matter; the first extractor matched is the one handling the URL.
2412         YoutubePlaylistIE(),
2437         StanfordOpenClassroomIE(),
2447         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    Looks up the module-level class named '<ie_name>IE' (e.g. 'Youtube'
    resolves to YoutubeIE); raises KeyError when no such class exists.
    """
    return globals()['%sIE' % ie_name]