request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
try:
self.report_login()
- login_results = compat_urllib_request.urlopen(request).read()
+ login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
return
request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
try:
self.report_age_confirmation()
- age_results = compat_urllib_request.urlopen(request).read()
+ age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
return
self.report_video_subtitles_download(video_id)
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
- srt_list = compat_urllib_request.urlopen(request).read()
+ srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
raise Trouble(u'WARNING: no closed captions found in the specified language')
request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
try:
- srt_xml = compat_urllib_request.urlopen(request).read()
+ srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
if not srt_xml:
raise Trouble(u'WARNING: unable to download video subtitles')
- video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+ video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
except Trouble as trouble:
- self._downloader.trouble(trouble[0])
+ self._downloader.trouble(str(trouble))
if 'length_seconds' not in video_info:
self._downloader.trouble(u'WARNING: unable to extract video duration')
request.add_header('Cookie', 'family_filter=off')
try:
self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage_bytes = compat_urllib_request.urlopen(request).read()
+ webpage = webpage_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
return
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
- video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
+ video_title = unescapeHTML(mobj.group('title'))
video_uploader = None
mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': video_uploader,
'upload_date': video_upload_date,
'title': video_title,
- 'ext': video_extension.decode('utf-8'),
- }]
-
-
-class GoogleIE(InfoExtractor):
- """Information extractor for video.google.com."""
-
- _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
- IE_NAME = u'video.google'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
-
- def report_extraction(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
-
- def _real_extract(self, url):
- # Extract id from URL
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
- return
-
- video_id = mobj.group(1)
-
- video_extension = 'mp4'
-
- # Retrieve video webpage to extract further information
- request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
- try:
- self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
-
- # Extract URL, uploader, and title from webpage
- self.report_extraction(video_id)
- mobj = re.search(r"download_url:'([^']+)'", webpage)
- if mobj is None:
- video_extension = 'flv'
- mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract media URL')
- return
- mediaURL = compat_urllib_parse.unquote(mobj.group(1))
- mediaURL = mediaURL.replace('\\x3d', '\x3d')
- mediaURL = mediaURL.replace('\\x26', '\x26')
-
- video_url = mediaURL
-
- mobj = re.search(r'<title>(.*)</title>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract title')
- return
- video_title = mobj.group(1).decode('utf-8')
-
- # Extract video description
- mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video description')
- return
- video_description = mobj.group(1).decode('utf-8')
- if not video_description:
- video_description = 'No description available.'
-
- # Extract video thumbnail
- if self._downloader.params.get('forcethumbnail', False):
- request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
- mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
- return
- video_thumbnail = mobj.group(1)
- else: # we need something to pass to process_info
- video_thumbnail = ''
-
- return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_extension.decode('utf-8'),
+ 'ext': video_extension,
}]
class YahooIE(InfoExtractor):
"""Information extractor for video.yahoo.com."""
+ _WORKING = False
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
request = compat_urllib_request.Request(url, None, std_headers)
try:
self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage_bytes = compat_urllib_request.urlopen(request).read()
+ webpage = webpage_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
- video_description = get_element_by_id("description", webpage.decode('utf8'))
+ video_description = get_element_by_id("description", webpage)
if video_description: video_description = clean_html(video_description)
else: video_description = ''
'url': compat_urllib_parse.unquote(info.get('url')),
'uploader': u'arte.tv',
'upload_date': info.get('date'),
- 'title': info.get('title'),
+ 'title': info.get('title').decode('utf-8'),
'ext': u'mp4',
'format': u'NA',
'player_url': None,
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = mobj.group(1)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
- video_uploader = mobj.group(1).decode('utf-8')
+ video_uploader = mobj.group(1)
return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
+ 'id': video_id,
+ 'url': video_url,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
- 'ext': video_extension.decode('utf-8'),
+ 'ext': video_extension,
}]
class YahooSearchIE(InfoExtractor):
"""Information Extractor for Yahoo! Video search queries."""
+
+ _WORKING = False
_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
request = compat_urllib_request.Request(url)
try:
- page = compat_urllib_request.urlopen(request).read().decode('utf8')
+ page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
return
request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
try:
- page = compat_urllib_request.urlopen(request).read()
+ page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
return
request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
try:
self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
self.report_config_download(showName)
try:
- configJSON = compat_urllib_request.urlopen(configUrl).read()
+ configJSON = compat_urllib_request.urlopen(configUrl)
+ m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
+ configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
return
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
- video_id = mobj.group(1).decode('utf-8')
+ video_id = mobj.group(1)
self.report_webpage(video_id)
request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage_bytes = compat_urllib_request.urlopen(request).read()
+ webpage = webpage_bytes.decode('utf-8', 'replace')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
return
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video url')
return
- video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
+ video_url = compat_urllib_parse.unquote(mobj.group(1))
# Extract title
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = mobj.group(1)
# Extract video thumbnail
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
return
- video_thumbnail = mobj.group(0).decode('utf-8')
+ video_thumbnail = mobj.group(0)
info = {
'id': video_id,
class MixcloudIE(InfoExtractor):
"""Information extractor for www.mixcloud.com"""
+
+ _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
IE_NAME = u'mixcloud'
self.report_extraction(video_id)
try:
- config = json.loads(jsondata)
+ jsonstr = jsondata.decode('utf-8')
+ config = json.loads(jsonstr)
video_title = config['data'][0]['title']
seed = config['data'][0]['seed']
fileid = config['data'][0]['streamfileids'][format]
- seg_number = len(config['data'][0]['segs'][format])
-
- keys=[]
- for i in xrange(seg_number):
- keys.append(config['data'][0]['segs'][format][i]['k'])
-
- #TODO check error
- #youku only could be viewed from mainland china
- except:
+ keys = [s['k'] for s in config['data'][0]['segs'][format]]
+ except (UnicodeDecodeError, ValueError, KeyError):
self._downloader.trouble(u'ERROR: unable to extract info section')
return
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
- video_id = mobj.group(1).decode('utf-8')
+ video_id = mobj.group(1)
self.report_webpage(video_id)
# Get webpage content
try:
- webpage = compat_urllib_request.urlopen(url).read()
+ webpage_bytes = compat_urllib_request.urlopen(url).read()
+ webpage = webpage_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
return
if result is None:
self._downloader.trouble(u'ERROR: unable to extract video url')
return
- video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))
+ video_url = compat_urllib_parse.unquote(result.group(1))
result = re.search(self.VIDEO_TITLE_RE, webpage)
if result is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
- video_title = result.group(1).decode('utf-8')
+ video_title = result.group(1)
result = re.search(self.VIDEO_THUMB_RE, webpage)
if result is None:
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
return
- video_thumbnail = result.group(1).decode('utf-8')
+ video_thumbnail = result.group(1)
return [{
'id': video_id,
class GooglePlusIE(InfoExtractor):
"""Information extractor for plus.google.com."""
- _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
+ _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
def __init__(self, downloader=None):
def report_extract_entry(self, url):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
def report_date(self, upload_date):
"""Report downloading extry"""
def report_uploader(self, uploader):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
def report_title(self, video_title):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
def report_extract_vid_page(self, video_page):
"""Report information extraction."""
- self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
def _real_extract(self, url):
# Extract id from URL
return
post_url = mobj.group(0)
- video_id = mobj.group(2)
+ video_id = mobj.group(1)
video_extension = 'flv'
self.report_extract_entry(post_url)
request = compat_urllib_request.Request(post_url)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
return
video_page = mobj.group(1)
request = compat_urllib_request.Request(video_page)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
# Only get the url. The resolution part in the tuple has no use anymore
video_url = video_url[-1]
# Treat escaped \u0026 style hex
- video_url = unicode(video_url, "unicode_escape")
+ try:
+ video_url = video_url.decode("unicode_escape")
+ except AttributeError: # Python 3
+ video_url = bytes(video_url, 'ascii').decode('unicode-escape')
return [{
- 'id': video_id.decode('utf-8'),
+ 'id': video_id,
'url': video_url,
- 'uploader': uploader.decode('utf-8'),
- 'upload_date': upload_date.decode('utf-8'),
- 'title': video_title.decode('utf-8'),
- 'ext': video_extension.decode('utf-8'),
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': video_extension,
}]
+
+class NBAIE(InfoExtractor):
+    """Information extractor for video pages on nba.com / watch.nba.com."""
+    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+    IE_NAME = u'nba'
+
+    def report_extraction(self, video_id):
+        """Report information extraction."""
+        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
+
+    def _real_extract(self, url):
+        # Pull the path-style video id (group 1) out of the URL.
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        video_id = mobj.group(1)
+        if video_id.endswith('/index.html'):
+            video_id = video_id[:-len('/index.html')]
+
+        self.report_extraction(video_id)
+        try:
+            urlh = compat_urllib_request.urlopen(url)
+            webpage_bytes = urlh.read()
+            webpage = webpage_bytes.decode('utf-8', 'ignore')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            # This fetches the HTML video page, not an XML document.
+            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
+            return
+
+        # The CDN media URL is derived directly from the video id.
+        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
+        def _findProp(rexp, default=None):
+            # First group of rexp in the webpage (HTML-unescaped), or default.
+            m = re.search(rexp, webpage)
+            if m:
+                return unescapeHTML(m.group(1))
+            else:
+                return default
+
+        shortened_video_id = video_id.rpartition('/')[2]
+        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+        info = {
+            'id': shortened_video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            # Key is 'upload_date' everywhere else (cf. JustinTVIE); the old
+            # misspelled 'uploader_date' was silently ignored downstream.
+            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
+            'description': _findProp(r'<div class="description">(.*?)</h1>'),
+        }
+        return [info]
+
+class JustinTVIE(InfoExtractor):
+    """Information extractor for justin.tv and twitch.tv"""
+    # TODO: One broadcast may be split into multiple videos. The key
+    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
+    # starts at 1 and increases. Can we treat all parts as one video?
+
+    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
+        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
+    _JUSTIN_PAGE_LIMIT = 100
+    IE_NAME = u'justin.tv'
+
+    def report_extraction(self, file_id):
+        """Report information extraction."""
+        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
+
+    def report_download_page(self, channel, offset):
+        """Report attempt to download a single page of videos."""
+        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
+                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
+
+    # Return count of items, list of *valid* items
+    def _parse_page(self, url):
+        """Download one JSON page of clips and return (raw_count, info_list).
+
+        On a download failure the error is reported and (0, []) is
+        returned, so callers can unpack the result unconditionally.
+        """
+        try:
+            urlh = compat_urllib_request.urlopen(url)
+            webpage_bytes = urlh.read()
+            webpage = webpage_bytes.decode('utf-8', 'ignore')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
+            # Returning None here would make the caller's tuple unpacking
+            # raise TypeError on top of the already-reported error.
+            return (0, [])
+
+        response = json.loads(webpage)
+        info = []
+        for clip in response:
+            video_url = clip['video_file_url']
+            # Clips without a video_file_url are unplayable; skip them.
+            if video_url:
+                video_extension = os.path.splitext(video_url)[1][1:]
+                # 'created_on' starts with an ISO date; strip dashes -> YYYYMMDD.
+                video_date = re.sub('-', '', clip['created_on'][:10])
+                info.append({
+                    'id': clip['id'],
+                    'url': video_url,
+                    'title': clip['title'],
+                    'uploader': clip.get('user_id', clip.get('channel_id')),
+                    'upload_date': video_date,
+                    'ext': video_extension,
+                })
+        return (len(response), info)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        api = 'http://api.justin.tv'
+        video_id = mobj.group(mobj.lastindex)
+        paged = False
+        if mobj.lastindex == 1:
+            # Channel URL (no /b/<id> part): page through the archive API.
+            paged = True
+            api += '/channel/archives/%s.json'
+        else:
+            api += '/clip/show/%s.json'
+        api = api % (video_id,)
+
+        self.report_extraction(video_id)
+
+        info = []
+        offset = 0
+        limit = self._JUSTIN_PAGE_LIMIT
+        while True:
+            if paged:
+                self.report_download_page(video_id, offset)
+            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
+            page_count, page_info = self._parse_page(page_url)
+            info.extend(page_info)
+            # A short page means the end of the archive was reached.
+            if not paged or page_count != limit:
+                break
+            offset += limit
+        return info