X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=ef02b68966e88d8d1cbc2f9d628e5a79fe8ce3c3;hb=7c360e3a04f09b912f51034c7778eb2297872e86;hp=02a82dc57cf5b0d07586d39614ae9f03acd81b5e;hpb=50317b111dadccba73bcdd828d9997d1da78a5f1;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 02a82dc57..ef02b6896 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -66,14 +66,15 @@ class InfoExtractor(object): * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp" or so. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this @@ -87,12 +88,18 @@ class InfoExtractor(object): The following fields are optional: + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. subtitles: The subtitle file contents as a dictionary in the format @@ -113,9 +120,6 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -239,7 +243,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = u'___' + hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) @@ -247,7 +251,21 @@ class InfoExtractor(object): with open(filename, 'wb') as outf: outf.write(webpage_bytes) - content = webpage_bytes.decode(encoding, 'replace') + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if (u'Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'