errnote='Could not download DASH manifest')
formats = []
- for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
- url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
- if url_el is None:
- continue
- format_id = r.attrib['id']
- video_url = url_el.text
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
- f = {
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(r.attrib.get('width')),
- 'height': int_or_none(r.attrib.get('height')),
- 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
- 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
- 'filesize': filesize,
- 'fps': int_or_none(r.attrib.get('frameRate')),
- }
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == format_id)
- except StopIteration:
- full_info = self._formats.get(format_id, {}).copy()
- full_info.update(f)
- formats.append(full_info)
- else:
- existing_format.update(f)
+ for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
+ mime_type = a.attrib.get('mimeType')
+ for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ if mime_type == 'text/vtt':
+ # TODO implement WebVTT downloading
+ pass
+ elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'height': int_or_none(r.attrib.get('height')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ 'fps': int_or_none(r.attrib.get('frameRate')),
+ }
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ full_info = self._formats.get(format_id, {}).copy()
+ full_info.update(f)
+ formats.append(full_info)
+ else:
+ existing_format.update(f)
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _real_extract(self, url):
def extract_videos_from_page(self, page):
+ # Scan `page` for href="/watch?v=<id>" links and return (video_id, title)
+ # pairs, one per distinct id, in the order each id first appears.
ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- return ids_in_page
+ titles_in_page = []
+ # ids_in_page and titles_in_page are parallel lists: titles_in_page[i]
+ # is the title recorded for ids_in_page[i].
+ for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+ video_id = mobj.group('id')
+ # NOTE(review): the title group is optional in the pattern, so
+ # mobj.group('title') can be None here - confirm unescapeHTML accepts None.
+ video_title = unescapeHTML(mobj.group('title'))
+ try:
+ idx = ids_in_page.index(video_id)
+ # Id already seen: only fill in a title if none was recorded for it yet.
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ # list.index found no match: this is the first occurrence of the id.
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ # The callers visible in this patch unpack each item as (video_id, video_title).
+ return zip(ids_in_page, titles_in_page)
def _real_extract(self, url):
channel_id = self._match_id(url)
- video_ids = []
url = 'https://www.youtube.com/channel/%s/videos' % channel_id
channel_page = self._download_webpage(url, channel_id)
autogenerated = re.search(r'''(?x)
if autogenerated:
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
- video_ids = self.extract_videos_from_page(channel_page)
entries = [
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+ for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
def _entries():
more_widget_html = content_html = channel_page
for pagenum in itertools.count(1):
- ids_in_page = self.extract_videos_from_page(content_html)
- for video_id in ids_in_page:
+ for video_id, video_title in self.extract_videos_from_page(content_html):
yield self.url_result(
- video_id, 'Youtube', video_id=video_id)
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
mobj = re.search(
r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
while (PAGE_SIZE * pagenum) < limit:
result_url = self._API_URL % (
compat_urllib_parse.quote_plus(query.encode('utf-8')),
- (PAGE_SIZE * pagenum) + 1)
+ max((PAGE_SIZE * pagenum) + 1), 2)
data_json = self._download_webpage(
result_url, video_id='query "%s"' % query,
note='Downloading page %s' % (pagenum + 1),