http://video[0-9].pornotube.com/.+\.flv)",'
+ result = re.search(VIDEO_URL_RE, webpage)
+ if result is None:
+ raise ExtractorError(u'Unable to extract video url')
+ video_url = compat_urllib_parse.unquote(result.group('url'))
- # Step 1, Retrieve post webpage to extract further information
- self.report_extract_entry(post_url)
- request = compat_urllib_request.Request(post_url)
- try:
- webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
- return
+ #Get the uploaded date
+ VIDEO_UPLOADED_RE = r'Added (?P[0-9\/]+) by'
+ result = re.search(VIDEO_UPLOADED_RE, webpage)
+ if result is None:
+ raise ExtractorError(u'Unable to extract video title')
+ upload_date = unified_strdate(result.group('date'))
- # Extract update date
- upload_date = None
- pattern = 'title="Timestamp">(.*?)'
- mobj = re.search(pattern, webpage)
- if mobj:
- upload_date = mobj.group(1)
- # Convert timestring to a format suitable for filename
- upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
- upload_date = upload_date.strftime('%Y%m%d')
- self.report_date(upload_date)
+ info = {'id': video_id,
+ 'url': video_url,
+ 'uploader': None,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': 'flv',
+ 'format': 'flv'}
- # Extract uploader
- uploader = None
- pattern = r'rel\="author".*?>(.*?)'
- mobj = re.search(pattern, webpage)
- if mobj:
- uploader = mobj.group(1)
- self.report_uploader(uploader)
+ return [info]
- # Extract title
- # Get the first line for title
- video_title = u'NA'
- pattern = r'[^.]+).html$'
- # Step 2, Stimulate clicking the image box to launch video
- pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
- mobj = re.search(pattern, webpage)
def _real_extract(self, url):
    """Extract the FLV stream URL and title for a youjizz.com video page.

    Returns a single-element list with the info dict (youtube-dl legacy
    convention).  Raises ExtractorError when the URL does not match or any
    of the page patterns cannot be found.

    NOTE(review): the regex named groups and literal HTML tags below were
    stripped by markup-removal corruption in this diff; they are restored
    here from the `.group('title')` / `.group('videoid')` / `.group('source')`
    calls that survived.
    """
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError(u'Invalid URL: %s' % url)

    video_id = mobj.group('videoid')

    # Get webpage content
    webpage = self._download_webpage(url, video_id)

    # Get the video title
    result = re.search(r'<title>(?P<title>.*)</title>', webpage)
    if result is None:
        raise ExtractorError(u'ERROR: unable to extract video title')
    video_title = result.group('title').strip()

    # Get the embed page (the player lives on a separate embed URL)
    result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
    if result is None:
        raise ExtractorError(u'ERROR: unable to extract embed page')

    embed_page_url = result.group(0).strip()
    video_id = result.group('videoid')
    webpage = self._download_webpage(embed_page_url, video_id)

    # Get the video URL from the flash player setup call
    result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
    if result is None:
        raise ExtractorError(u'ERROR: unable to extract video url')
    video_url = result.group('source')

    info = {'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url}

    return [info]
+
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes (a mix is a playlist of streamed tracks).

    Walks the 8tracks "sets" API one track at a time (play, then repeated
    next calls) until the API reports `at_last_track`, and returns one info
    dict per track.

    NOTE(review): the `_VALID_URL` named groups were stripped by
    markup-removal corruption in this diff; `(?P<id>` is restored from the
    surviving `mobj.group('id')` call, and `(?P<user>` is the conventional
    name for the first path component — TODO confirm against upstream.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON assignment.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # Ask the API for the next track of this mix.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
- video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
- def _findProp(rexp, default=None):
- m = re.search(rexp, webpage)
- if m:
- return unescapeHTML(m.group(1))
- else:
- return default
+class KeekIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)'
+ IE_NAME = u'keek'
- shortened_video_id = video_id.rpartition('/')[2]
- title = _findProp(r'[\S\s]+?(?P.+?)
', webpage)
+ uploader = clean_html(m.group('uploader'))
info = {
- 'id': shortened_video_id,
- 'url': video_url,
- 'ext': 'mp4',
- 'title': title,
- 'uploader_date': _findProp(r'Date: (.*?)
'),
- 'description': _findProp(r'(.*?)'),
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader
}
return [info]
-class JustinTVIE(InfoExtractor):
- """Information extractor for justin.tv and twitch.tv"""
- # TODO: One broadcast may be split into multiple videos. The key
- # 'broadcast_id' is the same for all parts, and 'broadcast_part'
- # starts at 1 and increases. Can we treat all parts as one video?
-
- _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
- ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
- _JUSTIN_PAGE_LIMIT = 100
- IE_NAME = u'justin.tv'
+class TEDIE(InfoExtractor):
+ _VALID_URL=r'''http://www\.ted\.com/
+ (
+ ((?P
playlists)/(?P\d+)) # We have a playlist
+ |
+ ((?Ptalks)) # We have a simple talk
+ )
+ (/lang/(.*?))? # The url may contain the language
+ /(?P\w+) # Here goes the name and then ".html"
+ '''
+
+ @classmethod
+ def suitable(cls, url):
+ """Receives a URL and returns True if suitable for this IE."""
+ return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
- def report_extraction(self, file_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
+ def _real_extract(self, url):
+ m=re.match(self._VALID_URL, url, re.VERBOSE)
+ if m.group('type_talk'):
+ return [self._talk_info(url)]
+ else :
+ playlist_id=m.group('playlist_id')
+ name=m.group('name')
+ self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
+ return [self._playlist_videos_info(url,name,playlist_id)]
+
+ def _talk_video_link(self,mediaSlug):
+ '''Returns the video link for that mediaSlug'''
+ return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
+
+ def _playlist_videos_info(self,url,name,playlist_id=0):
+ '''Returns the videos of the playlist'''
+ video_RE=r'''
+ \d+)"
+ ([.\s]*?)data-playlist_item_id="(\d+)"
+ ([.\s]*?)data-mediaslug="(?P.+?)"
+ '''
+ video_name_RE=r'(?P.+?)
'
+ webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
+ m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
+ m_names=re.finditer(video_name_RE,webpage)
+
+ playlist_RE = r'div class="headline">(\s*?)(\s*?)(?P.*?)'
+ m_playlist = re.search(playlist_RE, webpage)
+ playlist_title = m_playlist.group('playlist_title')
+
+ playlist_entries = []
+ for m_video, m_name in zip(m_videos,m_names):
+ video_id=m_video.group('video_id')
+ talk_url='http://www.ted.com%s' % m_name.group('talk_url')
+ playlist_entries.append(self.url_result(talk_url, 'TED'))
+ return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
+
+ def _talk_info(self, url, video_id=0):
+ """Return the video for the talk in the url"""
+ m=re.match(self._VALID_URL, url,re.VERBOSE)
+ videoName=m.group('name')
+ webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
+ # If the url includes the language we get the title translated
+ title_RE=r'(?P.*)'
+ title=re.search(title_RE, webpage).group('title')
+ info_RE=r'''