X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=35d427eec974a14ff27425780c13ae18be1f3aa7;hp=0852b8e8c1c15e3600ed16d5e95d9cac901f32bc;hb=9d6ac71c27b1dfb662c795ef598dbfd0286682da;hpb=46b18f2349670d395b9d84a57ee3d9b5d221ff4b diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0852b8e8c..35d427eec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -26,8 +27,12 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, +) +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, ) -from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -244,6 +249,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -292,8 +301,9 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title", "description" and "id" attributes - with the same semantics as videos (see above). + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). _type "multi_video" indicates that there are multiple videos that @@ -371,7 +381,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -415,7 +425,7 @@ class InfoExtractor(object): if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): - self._downloader.to_stdout( + self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' % (self._x_forwarded_for_ip, country_code.upper())) @@ -485,6 +495,16 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -514,15 +534,6 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -547,6 +558,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'