X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=ef02b68966e88d8d1cbc2f9d628e5a79fe8ce3c3;hb=24b8924b469bd1b75bcad82eec3823ad79472610;hp=6c5d77e583586b266d480fa4c01acf9949cfe3a7;hpb=b60016e83139ace517fc823cf2b22756e64c2e63;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6c5d77e58..ef02b6896 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,16 +63,18 @@ class InfoExtractor(object): * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp" or so. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this @@ -86,12 +88,18 @@ class InfoExtractor(object): The following fields are optional: + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. subtitles: The subtitle file contents as a dictionary in the format @@ -112,9 +120,6 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -238,7 +243,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) @@ -246,7 +251,21 @@ class InfoExtractor(object): with open(filename, 'wb') as outf: outf.write(webpage_bytes) - content = webpage_bytes.decode(encoding, 'replace') + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if (u'Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'