X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=2a5a85dc67b4f7a57d04d4f21c1608aa2c47f7f3;hb=69a0c470b5cbcb789ef0358b7f13a18bf7564fc1;hp=77a13aea533d17aa57f17e01929ca3a276787844;hpb=f8b362739e4f469b501aa804beb95cf1cfb1c916;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77a13aea5..2a5a85dc6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -35,6 +35,8 @@ class InfoExtractor(object): title: Video title, unescaped. ext: Video filename extension. + Instead of url and ext, formats can also specified. + The following fields are optional: format: The video format, defaults to ext (used for --get-format) @@ -52,8 +54,20 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen - - The fields should all be Unicode strings. + age_limit: Age restriction for the video, as an integer (years) + formats: A list of dictionaries for each format available, it must + be ordered from worst to best quality. Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from width and height if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19") + * width Width of the video, if known + * height Height of the video, if known + + Unless mentioned otherwise, the fields should be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. @@ -145,12 +159,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br']+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -300,6 +319,15 @@ class InfoExtractor(object): self._og_regex('video')], html, name, **kargs) + def _rta_search(self, html): + # See http://www.rtalabel.org/index.php?content=howtofaq#single + if re.search(r'(?ix)