"""Information Extractor class.
Information extractors are the classes that, given a URL, extract
- information from the video (or videos) the URL refers to. This
- information includes the real video URL, the video title and simplified
- title, author and others. The information is stored in a dictionary
- which is then passed to the FileDownloader. The FileDownloader
- processes this information possibly downloading the video to the file
- system, among other possible outcomes. The dictionaries must include
- the following fields:
-
- id: Video identifier.
- url: Final video URL.
- uploader: Nickname of the video uploader.
- title: Literal title.
- ext: Video filename extension.
- format: Video format.
- player_url: SWF Player URL (may be None).
-
- The following fields are optional. Their primary purpose is to allow
- youtube-dl to serve as the backend for a video search function, such
- as the one in youtube2mp3. They are only used when their respective
- forced printing functions are called:
-
- thumbnail: Full URL to a video thumbnail image.
- description: One-line video description.
+ information about the video (or videos) the URL refers to. This
+ information includes the real video URL, the video title, author and
+ others. The information is stored in a dictionary which is then
+ passed to the FileDownloader. The FileDownloader processes this
+ information, possibly downloading the video to the file system, among
+ other possible outcomes.
+
+ The dictionaries must include the following fields:
+
+ id: Video identifier.
+ url: Final video URL.
+ uploader: Nickname of the video uploader.
+ upload_date: Video upload date (YYYYMMDD).
+ title: Video title, unescaped.
+ ext: Video filename extension.
+
+ The following fields are optional:
+
+ format: The video format, defaults to ext (used for --get-format)
+ thumbnail: Full URL to a video thumbnail image.
+ description: One-line video description.
+ player_url: SWF Player URL (used for rtmpdump).
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
+
+ _real_extract() must return a *list* of information dictionaries as
+ described above.
"""
_ready = False
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
+ video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
+ self._video_dimensions.get(format_param, '???'))
+
results.append({
'id': video_id.decode('utf-8'),
'url': video_real_url.decode('utf-8'),
'upload_date': upload_date,
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
+ 'format': video_format,
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description,
'player_url': player_url,
'upload_date': u'NA',
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'upload_date': video_upload_date,
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'upload_date': u'NA',
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'upload_date': u'NA',
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description,
'thumbnail': video_thumbnail,
- 'player_url': None,
}]
'ext': video_extension,
'thumbnail': video_thumbnail,
'description': video_description,
- 'player_url': None,
}]
'upload_date': u'NA',
'title': video_title,
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'upload_date': u'NA',
'title': file_title,
'ext': file_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description.decode('utf-8'),
- 'player_url': None,
})
return results
'upload_date': u'NA',
'title': video_title,
'ext': u'flv',
- 'format': u'NA',
- 'player_url': None,
}]
class ComedyCentralIE(InfoExtractor):
_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
IE_NAME = u'comedycentral'
+ _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
+
+ _video_extensions = {
+ '3500': 'mp4',
+ '2200': 'mp4',
+ '1700': 'mp4',
+ '1200': 'mp4',
+ '750': 'mp4',
+ '400': 'mp4',
+ }
+ _video_dimensions = {
+ '3500': '1280x720',
+ '2200': '960x540',
+ '1700': '768x432',
+ '1200': '640x360',
+ '750': '512x288',
+ '400': '384x216',
+ }
+
def report_extraction(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
def report_player_url(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
+
+ def _print_formats(self, formats):
+ print('Available formats:')
+ for x in formats:
+ print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
+
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
epTitle = mobj.group('episode')
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
+
if len(mMovieParams) == 0:
- self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
- return
+ # The Colbert Report embeds the information in a data-mgid
+ # attribute without a URL prefix, so extract that alternate
+ # reference and then add the URL prefix manually.
+ altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
+ if len(altMovieParams) == 0:
+ self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
+ return
+ else:
+ mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
+
playerUrl_raw = mMovieParams[0][0]
self.report_player_url(epTitle)
try:
if len(turls) == 0:
self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
continue
+
+ if self._downloader.params.get('listformats', None):
+ self._print_formats([i[0] for i in turls])
+ return
# For now, just pick the highest bitrate
format,video_url = turls[-1]
+ # Get the format arg from the arg stream
+ req_format = self._downloader.params.get('format', None)
+
+ # Select format if we can find one
+ for f,v in turls:
+ if f == req_format:
+ format, video_url = f, v
+ break
+
+ # Patch to download from alternative CDN, which does not
+ # break on current RTMPDump builds
+ broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
+ better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
+
+ if video_url.startswith(broken_cdn):
+ video_url = video_url.replace(broken_cdn, better_cdn)
+
effTitle = showId + u'-' + epTitle
info = {
'id': shortMediaId,
'format': format,
'thumbnail': None,
'description': officialTitle,
- 'player_url': playerUrl
+ 'player_url': None #playerUrl
}
results.append(info)
'upload_date': None,
'title': showName,
'ext': 'flv',
- 'format': 'flv',
'thumbnail': imgUrl,
'description': description,
'player_url': playerUrl,
info['url'] = videoNode.findall('./file')[0].text
info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
info['ext'] = info['url'].rpartition('.')[2]
- info['format'] = info['ext']
except IndexError:
self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
return
'upload_date': None,
'title': video_title,
'ext': 'flv',
- 'format': 'flv',
'thumbnail': video_thumbnail,
'description': None,
- 'player_url': None,
}
return [info]
'upload_date': upload_date,
'title': title,
'ext': u'mp3',
- 'format': u'NA',
- 'player_url': None,
'description': description.decode('utf-8')
}]
'uploader': None,
'upload_date': None,
'title': video_title,
- 'ext': extension,
- 'format': extension, # Extension is always(?) mp4, but seems to be flv
+ 'ext': extension, # Extension is always(?) mp4, but seems to be flv
'thumbnail': None,
'description': video_description,
- 'player_url': None,
}
return [info]
self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
return
info['ext'] = info['url'].rpartition('.')[2]
- info['format'] = info['ext']
return [info]
elif mobj.group('course'): # A course page
course = mobj.group('course')
'uploader': None,
'title': video_title,
'ext': ext,
- 'format': u'NA'
}
files_info.append(info)
return
video_thumbnail = result.group(1).decode('utf-8')
- info = {'id': video_id,
- 'url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': 'flv',
- 'format': 'flv',
- 'thumbnail': video_thumbnail,
- 'description': None,
- 'player_url': None}
-
- return [info]
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': None,
+ 'upload_date': None,
+ 'title': video_title,
+ 'ext': 'flv',
+ 'thumbnail': video_thumbnail,
+ 'description': None,
+ }]
class GooglePlusIE(InfoExtractor):
'upload_date': upload_date.decode('utf-8'),
'title': video_title.decode('utf-8'),
'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
}]