2 # -*- coding: utf-8 -*-
11 import xml.etree.ElementTree
14 from urlparse import parse_qs
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader: Nickname of the video uploader, unescaped.
    upload_date: Video upload date (YYYYMMDD).
    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
               like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this view of the file is sampled, so additional
        # initialisation lines may be elided here.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def` headers of the working()/initialize()
    # accessors are elided in this sampled view; only their docstrings
    # and one body statement survive below.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an initialize() call is presumably elided before
        # this return — confirm against the full source.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): this view of the file is sampled — elided lines mean some
    # statements below appear without their enclosing try/if/for/return lines.

    # Verbose regex matching the many YouTube URL shapes (watch, embed,
    # youtu.be, -nocookie, naked ID) and capturing the video ID.
    # NOTE(review): the `_VALID_URL = r'''` opening line is elided here.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (most entries elided in this view).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions (entries elided in this view).
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class so _VALID_URL is matched with re.VERBOSE,
        # since the pattern above is written with inline comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into .srt subtitle text."""
        # NOTE(review): the srt-accumulator initialisation and the
        # float(start) conversion lines are elided in this sampled view.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when no dur attribute
            end = start + float(dur)
            # Format start/end as SRT "HH:MM:SS,mmm" timestamps.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print every available itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the language cookie, log in (params or .netrc) and confirm age."""
        if self._downloader is None:
            # (early-return body elided in this view)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): try/else lines around the netrc lookup are elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language preference so scraped pages are served in English.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        # Log in (form-dict opener elided in this view).
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age (form-dict opener elided in this view).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the real download URL(s) and metadata for a YouTube page."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several &el= variants of get_video_info until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage = compat_urllib_request.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: normalise separators to spaces, then try several formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description.
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions (converted to .srt on success).
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Map lang_code -> track name.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Pick the requested language, else English, else the first one.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # Duration (seconds, optional).
        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Token.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension for this itag (flv when unknown).
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opener is elided here.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': video_format,
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""
    # NOTE(review): this view of the file is sampled — elided lines mean some
    # statements below appear without their enclosing try/if/return lines.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and post the age form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (the form-dict opener is elided in this view).
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL and metadata from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull mediaURL/key out of the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # Info dictionary (the `return [{` opener is elided in this view).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""
    # NOTE(review): this view of the file is sampled — elided lines mean some
    # statements below appear without their enclosing try/if/return lines.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Download the Dailymotion page and build the info dictionary."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Dailymotion serves family-filtered pages unless this cookie is set.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')

        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe available qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for the official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        # Upload date appears as DD-MM-YYYY in the page; rearrange to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Info dictionary (the `return [{` opener is elided in this view).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""
    # NOTE(review): this view of the file is sampled — elided lines mean some
    # statements below appear without their enclosing try/if/return lines.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and thumbnail from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No mp4 download URL: fall back to the flv stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the \xNN escaping applied by the page's JavaScript.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')

        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            # (placeholder-thumbnail assignment elided in this view)

        # Info dictionary (the `return [{` opener is elided in this view).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""
    # NOTE(review): this view of the file is sampled — elided lines mean some
    # statements below appear without their enclosing try/if/return lines.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Info dictionary (the `return [{` opener is elided in this view).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
886 class YahooIE(InfoExtractor):
887 """Information extractor for video.yahoo.com."""
889 # _VALID_URL matches all Yahoo! Video URLs
890 # _VPAGE_URL matches only the extractable '/watch/' URLs
891 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
892 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
893 IE_NAME = u'video.yahoo'
895 def __init__(self, downloader=None):
896 InfoExtractor.__init__(self, downloader)
898 def report_download_webpage(self, video_id):
899 """Report webpage download."""
900 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
902 def report_extraction(self, video_id):
903 """Report information extraction."""
904 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo Video URL.

        Non-/watch/ URLs are first rewritten into the canonical
        English-language /watch/ form, and the method recurses once with
        new_video=False.

        NOTE(review): several control-flow lines (``try:``, ``if mobj is
        None:`` guards, ``return`` statements) are elided in this excerpt;
        the statements below are reproduced as shown.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)  # (guard elided)
        video_id = mobj.group(2)
        video_extension = 'flv'  # the playlist request below asks for the flash stream

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()  # (try: elided)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            # Page embeds the canonical ids as ("id", "...") / ("vid", "...") JS calls.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')  # (guard elided)
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')  # (guard elided)
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)  # (try: elided)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')  # (guard elided)
        video_title = mobj.group(1).decode('utf-8')  # bytes -> unicode (Python 2)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')  # (guard elided)
        # NOTE(review): group(1) is the '(people|profile)' alternative of the
        # pattern above; the uploader name is group(2). This looks like a bug
        # — confirm against the live page markup before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')  # (guard elided)
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')  # (guard elided)
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')  # (guard elided)
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')  # (guard elided)
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)  # (try: elided)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')  # (guard elided)
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (surrounding ``return [{ ... }]`` lines elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): guard/try/return/branch lines are elided throughout this
    excerpt; statements are reproduced as shown.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)  # (guard elided)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)  # (try: elided)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): brittle string-splitting of the embedded player
        # config; breaks as soon as the page layout changes.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)  # (try: elided)
        self._downloader.trouble(u'ERROR: unable to extract info section')  # (except branch; elided)

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (best effort; stays None when not found)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # (else: branch line elided — quality label taken from the
                #  first entry of the codec's file list)
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, best codec first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # (break / else: lines elided — the line below is the no-match branch)
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # (surrounding ``return [{ ... }]`` lines elided in this excerpt)
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    NOTE(review): guard/try/return lines and several call-argument lines
    are elided in this excerpt; statements are reproduced as shown.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body (return line elided in excerpt)."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)  # (try: elided)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the
        groups listed in *matchTuples* — (group_index, key, error_message)
        triples — into an info dict (dict init / return elided in excerpt)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)  # (guard elided)
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # (else: elided)
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve a live-stream page into a stream URL.

        NOTE(review): video_url is computed but no return statement is
        visible in this excerpt — confirm how the result is propagated.
        """
        video_lang = url.split('/')[-4]  # language segment of the URL path
        # (some grep_webpage arguments elided in excerpt)
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # (some grep_webpage arguments elided in excerpt)
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 catch-up page into the final video info."""
        video_lang = url.split('/')[-3]  # language segment of the URL path
        # (some grep_webpage arguments elided in excerpt)
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # (some grep_webpage arguments elided in excerpt)
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # (some grep_webpage arguments elided in excerpt)
        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)
        )

        # (surrounding ``return { ... }`` lines elided in this excerpt)
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',  # uploads are attributed to the site itself
        'upload_date': info.get('date'),
        'title': info.get('title'),

    def _real_extract(self, url):
        """Dispatch between live-stream and catch-up (+7) extraction."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # (return elided)
        # (else: elided)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    NOTE(review): guard/try/return lines are elided throughout this
    excerpt; statements are reproduced as shown.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect target is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                # (body elided in excerpt — presumably returns "HEAD"; confirm)

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # (else: elided)
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bespoke opener whose handler chain HEADs the URL,
        # follows redirects, and falls back to GET on 405.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # (comparison of new_url with url, and return lines, elided)
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)  # (try: elided)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)  # (guard elided)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')  # (guard elided)
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): message says 'title' but this step extracts the
        # uploader (domain name) — misleading error text.
        self._downloader.trouble(u'ERROR: unable to extract title')  # (guard elided)
        video_uploader = mobj.group(1).decode('utf-8')

        # (surrounding ``return [{ ... }]`` lines elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    NOTE(review): guard/try/return/loop-control lines are elided in this
    excerpt; statements are reproduced as shown.
    """
    # Handles ytsearch:, ytsearchN:, ytsearchall: pseudo-URLs.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000  # hard ceiling on requested results
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)  # (guard elided)

        # NOTE(review): split(':') mis-handles queries that themselves
        # contain ':' — confirm whether a maxsplit is intended.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # (prefix-dispatch branch headers partially elided below)
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # (numeric-prefix parsing lines elided)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids/pagenum/limit initialisation elided)
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read()  # (try: elided)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps how many results can actually be fetched.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    NOTE(review): guard/try/return/loop-control lines are elided in this
    excerpt; statements are reproduced as shown.
    """
    # Handles gvsearch:, gvsearchN:, gvsearchall: pseudo-URLs.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000  # hard ceiling on requested results
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)  # (guard elided)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # (prefix-dispatch branch headers partially elided below)
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # (numeric-prefix parsing lines elided)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids/pagenum initialisation and loop header elided)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # (return elided)

        # No "next page" link: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # (return elided)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    NOTE(review): guard/try/return/loop-control lines are elided in this
    excerpt; statements are reproduced as shown.
    """
    # Handles yvsearch:, yvsearchN:, yvsearchall: pseudo-URLs.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000  # hard ceiling on requested results
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)  # (guard elided)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # (prefix-dispatch branch headers partially elided below)
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # (numeric-prefix parsing lines elided)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids/pagenum initialisation and loop header elided)
        already_seen = set()  # dedupe ids across result pages
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # (return elided)

        # No "next page" link: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # (return elided)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): guard/try/return/loop-control lines are elided in this
    excerpt; statements are reproduced as shown.
    """

    # Accepts playlist/course/artist/user-list URLs as well as bare PL/EC ids.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)  # (guard elided)

        # Single video case: group(3) carries a video id embedded in the
        # playlist URL — hand it back to the downloader directly.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            # (return elided)

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # (else: elided — the two lines below are the default branch)
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        # (video_ids/pagenum initialisation and loop header elided)
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read()  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # (break elided)
        pagenum = pagenum + 1

        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    NOTE(review): guard/try/loop-control lines are elided in this excerpt;
    statements are reproduced as shown.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)  # (guard elided)

        # Download channel pages
        channel_id = mobj.group(1)
        # (video_ids/pagenum initialisation and loop header elided)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read()  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # (break elided)
        pagenum = pagenum + 1

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    NOTE(review): guard/try/loop-control lines are elided in this excerpt;
    statements are reproduced as shown.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GData API max results per request
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)  # (guard elided)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (remainder of comment and loop header elided)
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1  # GData indices are 1-based
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = compat_urllib_request.urlopen(request).read()  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # (break elided)

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    NOTE(review): guard/try/loop-control lines are elided in this excerpt;
    statements are reproduced as shown.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)  # (guard elided)

        username = mobj.group(1)

        # AJAX endpoint listing a user's episodes; users_id is filled in
        # after scraping it from the profile page below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')  # (try: elided)
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (remainder of comment and loop header elided)
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')  # (try: elided)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): uses str(err) while sibling extractors use
            # compat_str(err) — inconsistent, and lossy on Python 2 for
            # non-ASCII error text.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # NOTE(review): self._PAGE_SIZE is not defined in the visible part
        # of this class — confirm it is declared elsewhere.
        if len(ids_in_page) < self._PAGE_SIZE:
            # (break elided)

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles link.

        Returns a one-element list of info dictionaries, or reports an
        error through self._downloader.trouble and returns None.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale so the regexps below match
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex (was '\s+'); avoids the
                # invalid-escape-sequence deprecation on newer Pythons
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with --username/--password or .netrc data before
    scraping the video page for title, owner, thumbnail and format urls.
    """

    # matches /video/video.php?v=<ID> and /photo.php?v=<ID>
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # machine name looked up in ~/.netrc for stored credentials
    _NETRC_MACHINE = 'facebook'
    # ordered best -> worst; order matters for format selection below
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that matched, plus 'video_urls' (format -> url).
        """
        # metadata regexps keyed by the output field they fill
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # values arrive percent- and unicode-escaped inside the
                # page's JavaScript; undo both layers
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # one candidate media url per known format name
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Best-effort login; failures only warn, they never abort."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # no credentials from either source: stay anonymous
        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # a login form in the response means the login was rejected
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Scrape the video page and build one info dict per chosen format."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader nickname is mandatory
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title is mandatory
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail is optional: warn and fall back to empty string
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date: parse the RFC-2822 style date when present
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --format-limit caps quality at the given format
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # extension keyed on format name, defaulting to mp4
            video_extension = self._video_extensions.get(format_param, 'mp4')

                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Re-requests the page with skin=json to get structured metadata;
    a video/* Content-Type instead means a direct media download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # captures the filename extension at the end of a media url
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract one info dict from a blip.tv page via its JSON skin."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # ask the same page for machine-readable output
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
                'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # metadata may be nested under a 'Post' wrapper
                if 'Post' in json_data:
                    data = json_data['Post']
                    # datestamp format e.g. "12-31-12 11:59PM"
                    upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                    video_url = data['media']['url']
                    umobj = re.match(self._URL_EXT, video_url)
                        raise ValueError('Can not determine filename extension')
                    ext = umobj.group(1)

                        'id': data['item_id'],
                        'uploader': data['display_name'],
                        'upload_date': upload_date,
                        'title': data['title'],
                        'format': data['media']['mimeType'],
                        'thumbnail': data['thumbnailUrl'],
                        'description': data['description'],
                        'player_url': data['embedUrl']
                except (ValueError,KeyError) as err:
                    self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves rtmp-free media to the iTunes user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Build the flv URL from the thumbnail path of a myvideo.de page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.trouble — there is no attribute
            # _download, so every invalid URL raised AttributeError
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # the media directory is only exposed through the thumbnail link
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts either short aliases (tds:, colbertnation:, ...) or full
    episode urls; resolves the MTV media feed, then picks a bitrate.
    """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # known bitrates, worst -> best; the last entry is the default pick
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print every known bitrate with its extension and dimensions."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve an episode url/alias and return a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # map the short aliases onto the shows' full-episode index pages
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # no explicit episode means "download the newest one"
        dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # the index page redirects to the newest episode's url
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # follow redirects to find the canonical player url
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # one <item> per act/segment of the episode
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            # collect (bitrate, url) pairs from the config's renditions
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL of an Escapist video.

        Walks page -> og:video player URL -> JSON-ish config -> playlist.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # decode with the charset the server declared, utf-8 otherwise
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # ROBUSTNESS FIX: each scrape below used to call .group(1) on a
        # possibly-None match; a page layout change then crashed with
        # AttributeError instead of reporting a clean extraction error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # the player URL carries the config URL in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor page to its internal id, then read the
        moogaloop metadata XML for title, description and media URL."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group('videoid')

        self.report_webpage(video_id)
        page_request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # the metadata service is keyed on a second, internal id
        # embedded in the page markup
        id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if id_match is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_video_id = id_match.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # extension = everything after the last dot of the media url
            info['ext'] = info['url'].rpartition('.')[2]
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape media url, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # fetch the canonical watch page for this id
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # the media url is percent-encoded in a flash variable
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1).decode('utf-8'))

        # the page <title> carries the video title
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1).decode('utf-8')

        # thumbnail: the whole matched url is kept, not just the basename
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract uid and stream token and compose the stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search(r'"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj is None:
            # ROBUSTNESS FIX: bail out explicitly instead of hitting a
            # NameError on video_id/stream_token further down
            self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
            return
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search(r'"title":"(.*?)",', webpage)
        if mobj is not None:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj is not None:
            description = mobj.group(1)

        # upload date, e.g. "November 1, 2012 14:30"
        upload_date = None
        mobj = re.search(r"pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj is not None:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            # CONSISTENCY FIX: was the py2-only "except Exception, e"
            # syntax; the rest of the file uses the "as err" form, which
            # parses on both Python 2.6+ and Python 3
            except Exception as err:
                self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id': video_id.decode('utf-8'),
            'url': mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': title,
            'ext': u'mp3',
            'description': description.decode('utf-8')
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 media reference on an InfoQ talk page."""
        page_match = re.match(self._VALID_URL, url)
        if page_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        page_request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # the rtmp path is base64- and percent-encoded in a JS attribute
        ref_match = re.search(r"jsclassref='([^']*)'", webpage)
        if ref_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(ref_match.group(1).decode('base64'))

        # the talk title lives in a JS assignment
        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1).decode('utf-8')

        # derive id and extension from the media file name
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Resolves a cloudcast page to a downloadable audio URL via the
    Mixcloud JSON API ('audio_formats' section).
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If no usable bitrate is requested, the highest available one
        is selected; formats without per-bitrate info are returned
        whole (the TypeError branch below).
        """
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate URL with a plain request; a network
            # error means the URL is dead and the next one is tried.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print a human-readable table of formats/bitrates/extensions."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # Parse the API response.
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # No explicit format requested: try each format until one
            # yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
        # Explicitly requested format must exist in the API response.
        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # Fields of the info dictionary handed to the FileDownloader:
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three kinds of URLs (see _VALID_URL): a single video page
    (downloads its metadata XML), a course page (expanded into
    'reference' entries, one per video), and the site root (expanded
    into 'reference' entries, one per course). References are resolved
    recursively via self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report that the webpage download is starting."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report that information extraction is starting."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # Base fields of the info dict for this single video:
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # <title> and <videoFile> elements carry the display title
            # and the video file name (relative to baseUrl).
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            self.report_download_webpage(info['id'])
            coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            # Course title from the page <h1>; fall back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a
            # 'reference' entry, later extracted recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # Root page: enumerate all courses as references.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes the page's <meta> tags for the song/performer and the
    mtvn_uri, then queries the mediaGen service for the renditions
    and always picks the last (highest quality) one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report that the webpage download is starting."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information extraction is starting."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # _VALID_URL makes the scheme optional; default to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Song name and performer come from mtv_vt / mtv_an meta tags
        # (page is ISO-8859-1 encoded, hence the decode).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        # Playlist/content id from the inline player configuration.
        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # Query the mediaGen service for the rendition list (XML).
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format label e.g. "flv-640x360_700"; <src> holds the stream URL.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # Fields of the info dictionary handed to the FileDownloader:
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Builds segment download URLs from the getPlayList JSON: the file
    id is descrambled with a seed-driven character mix, and each
    segment gets its own keyed getFlvPath URL.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        # NOTE(review): the four lines below are the body of the session-id
        # generator _gen_sid() (called from _real_extract); its 'def' line
        # is not visible in this excerpt — confirm against the full file.
        # The sid is a millisecond timestamp followed by two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-dependent character permutation used to
        descramble file ids (a pseudo-random draw without replacement
        from the source alphabet)."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear-congruential step, then pick/remove one character.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Descramble fileId: each '*'-separated index selects a
        character from the mix string for this seed."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Video metadata (title, seed, stream ids, segments) comes from
        # the getPlayList JSON endpoint.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # Map the requested --format onto one of the stream ids
        # advertised by the server.
        format = self._downloader.params.get('format', None)
        supported_format = config['data'][0]['streamfileids'].keys()

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        seg_number = len(config['data'][0]['segs'][format])

        # One download key per segment.
        for i in xrange(seg_number):
            keys.append(config['data'][0]['segs'][format][i]['k'])

        #youku only could be viewed from mainland china
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the segment number (hex) into the fileid, then build
            # the keyed per-segment download URL.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # One info dict per segment ("<id>_partNN"):
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    Pulls the flv URL, title and thumbnail straight out of the video
    page with the three regexes below.
    """

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'

    # Page-scraping patterns: URL-quoted flv URL, page title, thumbnail.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report that the webpage download is starting"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information extraction is starting"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # The numeric id is the first capture group of _VALID_URL.
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        # flv URL is URL-quoted in the page source.
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader:
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
3446 class GooglePlusIE(InfoExtractor):
3447 """Information extractor for plus.google.com."""
3449 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3450 IE_NAME = u'plus.google'
3452 def __init__(self, downloader=None):
3453 InfoExtractor.__init__(self, downloader)
3455 def report_extract_entry(self, url):
3456 """Report downloading extry"""
3457 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3459 def report_date(self, upload_date):
3460 """Report downloading extry"""
3461 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3463 def report_uploader(self, uploader):
3464 """Report downloading extry"""
3465 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3467 def report_title(self, video_title):
3468 """Report downloading extry"""
3469 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3471 def report_extract_vid_page(self, video_page):
3472 """Report information extraction."""
3473 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3475 def _real_extract(self, url):
3476 # Extract id from URL
3477 mobj = re.match(self._VALID_URL, url)
3479 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3482 post_url = mobj.group(0)
3483 video_id = mobj.group(2)
3485 video_extension = 'flv'
3487 # Step 1, Retrieve post webpage to extract further information
3488 self.report_extract_entry(post_url)
3489 request = compat_urllib_request.Request(post_url)
3491 webpage = compat_urllib_request.urlopen(request).read()
3492 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3493 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3496 # Extract update date
3498 pattern = 'title="Timestamp">(.*?)</a>'
3499 mobj = re.search(pattern, webpage)
3501 upload_date = mobj.group(1)
3502 # Convert timestring to a format suitable for filename
3503 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3504 upload_date = upload_date.strftime('%Y%m%d')
3505 self.report_date(upload_date)
3509 pattern = r'rel\="author".*?>(.*?)</a>'
3510 mobj = re.search(pattern, webpage)
3512 uploader = mobj.group(1)
3513 self.report_uploader(uploader)
3516 # Get the first line for title
3518 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3519 mobj = re.search(pattern, webpage)
3521 video_title = mobj.group(1)
3522 self.report_title(video_title)
3524 # Step 2, Stimulate clicking the image box to launch video
3525 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3526 mobj = re.search(pattern, webpage)
3528 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3530 video_page = mobj.group(1)
3531 request = compat_urllib_request.Request(video_page)
3533 webpage = compat_urllib_request.urlopen(request).read()
3534 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3535 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3537 self.report_extract_vid_page(video_page)
3540 # Extract video links on video page
3541 """Extract video links of all sizes"""
3542 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3543 mobj = re.findall(pattern, webpage)
3545 self._downloader.trouble(u'ERROR: unable to extract video links')
3547 # Sort in resolution
3548 links = sorted(mobj)
3550 # Choose the lowest of the sort, i.e. highest resolution
3551 video_url = links[-1]
3552 # Only get the url. The resolution part in the tuple has no use anymore
3553 video_url = video_url[-1]
3554 # Treat escaped \u0026 style hex
3555 video_url = unicode(video_url, "unicode_escape")
3559 'id': video_id.decode('utf-8'),
3561 'uploader': uploader.decode('utf-8'),
3562 'upload_date': upload_date.decode('utf-8'),
3563 'title': video_title.decode('utf-8'),
3564 'ext': video_extension.decode('utf-8'),