2 # -*- coding: utf-8 -*-
11 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:    Nickname of the video uploader, unescaped.
    upload_date: Video upload date (YYYYMMDD).
    title:       Video title, unescaped.
    ext:         Video filename extension.

    The following fields are optional:

    format:      The video format, defaults to ext (used for --get-format)
    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.
    player_url:  SWF Player URL (used for rtmpdump).
    subtitles:   The .srt file contents.
    urlhandle:   [internal] The urlHandle to be used to download the file,
                 like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): class attributes such as _WORKING and the readiness flag
    # set by initialize() are not present in this excerpt -- confirm upstream.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the original presumably also reset a "ready" flag here
        # (line missing from this excerpt).
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Plain (non-VERBOSE) match; subclasses with verbose patterns
        # (e.g. YoutubeIE) override this method to pass re.VERBOSE.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the 'def working(self):' header enclosing the next
    # docstring is not present in this excerpt.
        """Getter method for _WORKING."""

    # NOTE(review): the 'def initialize(self):' header (and its one-time
    # initialization guard) is not present in this excerpt.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the original likely called initialize() before
        # delegating (line missing from this excerpt).
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt is missing interior lines throughout the
    # class (try:/return/else: lines, dict entries); gaps are marked below.

    # NOTE(review): the '_VALID_URL = r"""' opening line and the 'v=' branch
    # of the pattern are not present in this excerpt. The pattern is a
    # re.VERBOSE regexp (see suitable() below); '#' starts a regex comment.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    # Forces English UI so scraped dates/labels parse predictably.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original URL from age-gate style redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; most entries are missing from this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): closing brace and the _video_dimensions entries are not
    # present in this excerpt.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class so _VALID_URL is compiled with re.VERBOSE
        # (the pattern above relies on whitespace and inline comments).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Converts YouTube's closed-caption XML into .srt text.
        # NOTE(review): the 'srt = ...' accumulator initialization is not
        # present in this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            # NOTE(review): a 'start = float(start)' conversion appears to be
            # missing from this excerpt; as shown, 'start' is still a str here.
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps for the .srt cue line.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # NOTE(review): the 'return srt' line is not present in this excerpt.

    def _print_formats(self, formats):
        # Prints itag / extension / dimensions for each available format.
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' loop header is not present in
        # this excerpt.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets UI language and optionally logs in / confirms age before any
        # extraction. Credential failures are warnings, not fatal errors.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing 'try:' and the branch storing the
            # netrc login/password are not present in this excerpt.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Bad or absent .netrc is only a warning; continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force the English-language cookie so later scraping is stable.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the login_form dict opening (and remaining fields)
        # is not present in this excerpt.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age; unlike login, failure here is treated as fatal.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 is the video-ID capture; group 1 is the scheme/host prefix.
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try each 'el' variant until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page, normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # Closed captions: list available tracks, pick a language, fetch and
        # convert to .srt. Trouble is raised/caught as a local warning channel.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # NOTE(review): dict.keys() is not subscriptable on
                    # Python 3 -- looks like a py2 leftover; TODO confirm the
                    # target interpreter.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # NOTE(review): indexing an exception (trouble[0]) only works
                # on Python 2 -- TODO confirm.
                self._downloader.trouble(trouble[0])

        # Duration (optional).
        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Token.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # Legacy RTMP delivery: a single stream, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): ud['sig'] is read without being checked by the
            # filter above -- entries lacking 'sig' would raise KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit (list is ordered best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param if format_param else video_extension,
                                            self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
        # NOTE(review): the dict opening ('id': ...), its closing brace, and
        # the final 'return results' are not present in this excerpt.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): interior lines (try:/return/else:, dict braces) are
    # missing from this excerpt; gaps are marked below.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages used to disable the age/content filter up front.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the filter form.
        # NOTE(review): the disclaimer_form dict opening and other fields are
        # not present in this excerpt.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate 'yt-' IDs to the downloader (YouTube extractor).
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Crude extension guess from the URL's last three characters.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') on these values is a py2 idiom
        # (webpage is bytes there) -- TODO confirm target interpreter.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the 'return [{' opening of the info dict is not
        # present in this excerpt.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): interior lines (if/try/return/else:) are missing from
    # this excerpt; gaps are marked below.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip slug suffix ('_...') and query string from the captured ID.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe quality keys best-first; first hit wins.
        # NOTE(review): the 'if key in flashvars:' / 'max_quality = key' lines
        # are not present in this excerpt.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') here is a py2 idiom -- TODO confirm.
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the 'return [{' opening of the info dict is not
        # present in this excerpt.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): interior lines (if/try/return/else:) are missing from
    # this excerpt; gaps are marked below.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the direct download_url; fall back to the flv videoUrl.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Undo the JS hex-escaping of '=' and '&'.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') is a py2 idiom here -- TODO confirm.
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnail comes from a site search for this doc id.
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

        # NOTE(review): the 'return [{' opening of the info dict is not
        # present in this excerpt.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): interior lines (if/try/return) are missing from this
    # excerpt; gaps are marked below.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader both come from the page <title>.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') is a py2 idiom here -- TODO confirm.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the 'return [{' opening of the info dict is not
        # present in this excerpt.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
    """Extract the info dict for a video.yahoo.com URL.

    Non-canonical URLs are first rewritten to the English /watch/ form and
    re-extracted recursively (new_video=False marks the recursive pass).
    NOTE(review): several control-flow lines (try:, `if mobj is None:`
    guards, return statements) are elided in this excerpt; trouble() calls
    below are the error paths behind those elided guards.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    video_id = mobj.group(2)
    # The playlist endpoint below serves FLV streams.
    video_extension = 'flv'

    # Rewrite valid but non-extractable URLs as
    # extractable English language /watch/ URLs
    if re.match(self._VPAGE_URL, url) is None:
        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # The page embeds its own id/vid pair; scrape both and rebuild
        # the canonical watch URL from them.
        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract id field')
        yahoo_id = mobj.group(1)

        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract vid field')
        yahoo_vid = mobj.group(1)

        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
        # Recurse once on the canonical URL; new_video=False flags the retry.
        return self._real_extract(url, new_video=False)

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video uploader')
    # NOTE(review): group(1) here is the 'people|profile' alternative, not the
    # uploader name in group(2) — looks like an off-by-one; confirm upstream.
    video_uploader = mobj.group(1).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
    video_thumbnail = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video height and width
    mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video height')
    yv_video_height = mobj.group(1)

    mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video width')
    yv_video_width = mobj.group(1)

    # Retrieve video playlist to extract media URL
    # I'm not completely sure what all these options are, but we
    # seem to need most of them, otherwise the server sends a 401.
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
    yv_bitrate = '700' # according to Wikipedia this is hard-coded
    request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract media URL from playlist XML
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    self._downloader.trouble(u'ERROR: Unable to extract media URL')
    video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = unescapeHTML(video_url)

    # NOTE(review): the surrounding `return [{...}]` literal (including the
    # 'url' entry) is elided in this excerpt.
    'id': video_id.decode('utf-8'),
    'uploader': video_uploader,
    'upload_date': None,
    'title': video_title,
    'ext': video_extension.decode('utf-8'),
    'thumbnail': video_thumbnail.decode('utf-8'),
    'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # NOTE(review): the '.' after (?:www|player) is unescaped and therefore
    # matches any character; it was presumably meant to be '\.' — confirm
    # before tightening.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dict for a Vimeo watch/player URL.

        NOTE(review): try:/guard/return lines are elided in this excerpt;
        the trouble() calls below sit behind those elided guards.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # Slices the inline player config out of the page between the
        # ' = {config:' and ',assets:' markers; fragile string surgery.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Title and uploader come straight from the parsed config JSON.
        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` line is elided before this append.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket in preference order; trouble() is the
        # no-codec-found fallthrough (its surrounding else/return elided).
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the surrounding `return [{...}]` literal (including
        # the 'id' and 'url' entries) is elided in this excerpt.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # A trailing index-NN.html path component marks the live-stream page.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body.

        NOTE(review): the try:/return lines are elided in this excerpt.
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect named groups.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under its key in the returned info dict.
        NOTE(review): the `info = {}` init, guard and return lines are
        elided in this excerpt.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # NOTE(review): an `else:` line is elided before this store.
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the live-stream URL for a live page (language from the URL path)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # First hop: locate the videothek JS config file.
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)

        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # Second hop: pull the geo-restricted stream path and SWF player.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)

        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve the info dict for an Arte+7 (catch-up) page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # First hop: the embedded player points at a videoref XML file.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # Second hop: pick the language-specific <video> reference.
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # Final hop: scrape id/title/date and the HD stream URL.
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # NOTE(review): the surrounding `return {...}` literal is elided.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # NOTE(review): the `else:` / return lines are elided here.
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: the generic extractor is a best-effort fallback.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Force HTTP HEAD so we can follow redirects without a body.
            def get_method(self):
                # NOTE(review): the `return "HEAD"` body is elided here.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL."""
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD retry has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): headers= kwarg line elided here.
                                       origin_req_host=req.get_origin_req_host(),
                # NOTE(review): the `else:` line is elided before this raise.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """Fallback to GET if HEAD is not allowed (405 HTTP error)."""
            def http_error_405(self, req, fp, code, msg, headers):
                # Retry the same URL with GET, minus body-related headers.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      # NOTE(review): headers= kwarg line elided here.
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener from scratch so only our handlers run.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        # NOTE(review): the same-URL early-return guard is elided here.
        self.report_following_redirect(new_url)
        # Restart the extraction chain on the redirect target.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        """Scrape a media URL out of an arbitrary page (last resort).

        NOTE(review): try:/guard/return lines are elided in this excerpt;
        the trouble() calls below sit behind those elided guards.
        """
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): error text says 'title' but this is the uploader path.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # NOTE(review): the surrounding `return [{...}]` literal is elided.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Pseudo-URL scheme: ytsearch:<terms>, ytsearchN:<terms>, ytsearchall:<terms>.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API v2; returns up to 50 results per page as JSON-C.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # Python 2-era: query arrives as bytes and is decoded for display.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results.

        NOTE(review): guard/try/return lines are elided in this excerpt.
        NOTE(review): `query.split(':')` breaks if the terms contain ':';
        a maxsplit=1 would be safer — confirm against full file.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix ("ytsearch:") means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # Otherwise the prefix is parsed as an integer count (try elided).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): pagenum/limit/video_ids initialisation is elided.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap at whatever the API reports as the total result count.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Pseudo-URL scheme: gvsearch:<terms>, gvsearchN:<terms>, gvsearchall:<terms>.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Scraped from the result HTML rather than an API.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results.

        NOTE(review): guard/try/return lines are elided in this excerpt,
        mirroring YoutubeSearchIE._real_extract.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # Otherwise the prefix is parsed as an integer count (try elided).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): video_ids/pagenum init and the loop header are elided.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # Stop when the "next page" control disappears from the HTML.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # Pseudo-URL scheme: yvsearch:<terms>, yvsearchN:<terms>, yvsearchall:<terms>.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results.

        NOTE(review): guard/try/return lines are elided in this excerpt,
        mirroring the other search IEs.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Otherwise the prefix is parsed as an integer count (try elided).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedup across pages: Yahoo repeats results between pages.
        already_seen = set()
        # NOTE(review): video_ids/pagenum init and the loop header are elided.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # Stop when the "Next" control disappears from the HTML.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): list-type prefix (p/a/list); group(2): playlist id;
    # group(3): an optional embedded video id (downloaded directly).
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # Filled in with the playlist id before use.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue each for download.

        NOTE(review): guard/try/return lines are elided in this excerpt.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # A single video inside the playlist URL wins over the playlist.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # NOTE(review): the `else:` line is elided before these two lines.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # NOTE(review): pagenum/video_ids init and loop header are elided.
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop when the pager's "next" control disappears.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply --playlist-start/--playlist-end (1-based, -1 == to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is elided before this slice.
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    # List view sorted by date ascending so ids come back in upload order.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue each for download.

        NOTE(review): guard/try/return lines are elided in this excerpt.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # NOTE(review): pagenum/video_ids init and loop header are elided.
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop when the "Next" button disappears.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Accepts both full user URLs and the ytuser:<name> shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request; we page through in chunks of this size.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user and queue each for download.

        NOTE(review): guard/try/loop-header lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # GData uses 1-based start indices.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): the `break` line is elided here.

        all_ids_count = len(video_ids)
        # Apply --playlist-start/--playlist-end (1-based, -1 == to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is elided before this slice.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Accepts both user URLs and the bliptvuser:<name> shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        # Shared setup lives in the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video paths of a blip.tv user and queue them.

        NOTE(review): guard/try/loop-header lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # The Ajax episode-list endpoint needs the numeric users_id, which
        # is scraped from the profile page below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): uses str(err) here, unlike the compat_str(err)
            # used everywhere else — inconsistent but behaviorally similar.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # NOTE(review): _PAGE_SIZE is referenced but not defined in this
        # excerpt — confirm it is a class attribute in the full file.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): the `break` line is elided here.

        all_ids_count = len(video_ids)
        # Apply --playlist-start/--playlist-end (1-based, -1 == to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is elided before this slice.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rebuilds the URL in the English locale, POSTs the page with the
    'Free download' gateway flag set, then scrapes the real file URL
    and title out of the returned HTML.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # NOTE(review): `try:` openers, `if mobj is None:` guards,
        # `return` statements and the `return [{` opener of the result
        # dictionary are missing from this excerpt; restore them before
        # relying on this method.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing the form data simulates pressing the 'Free download' button.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))

        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's error banner.
                # NOTE(review): '\s+' should be a raw string (r'\s+').
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Fields of the information dictionary handed to the downloader.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # group 'ID' is the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names in best-to-worst order.
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): dictionary body missing from this excerpt.
    _video_extensions = {

    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # NOTE(review): the video_info/video_urls initialisations, the
        # closing brace of `data` and the `return video_info` are
        # missing from this excerpt.
        # Map of info-dict field -> regexp that captures it from the page.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are embedded as escaped-Unicode JavaScript strings.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc.

        NOTE(review): guard bodies, `try:` openers, the login_form
        construction and several `return`s are missing from this excerpt.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))

        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        # NOTE(review): `if mobj is None:` guards, `try:` openers,
        # `return`s, `else:` branches and the `info = { ... }` /
        # `results.append(...)` scaffolding are missing from this excerpt.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail is optional: warn and fall back to empty string.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # upload date: parse the RFC-2822 date into YYYYMMDD.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict to formats at or below the limit (list is
            # best-to-worst, so slice from the limit onwards).
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # Build one information dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'mp4')
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Fetches the JSON metadata for a blip.tv page (or detects a direct
    video download from the Content-Type header) and builds the
    information dictionary from it.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # NOTE(review): the `if mobj is None:` guard, `cchar` selection,
        # `try:` openers, `info = ...` assignments and `return [info]`
        # are missing from this excerpt; restore them before relying on
        # this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for JSON metadata instead of the HTML page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))

        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # Convert blip.tv's 'MM-DD-YY HH:MMam/pm' stamp to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
        except (ValueError,KeyError) as err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves some formats only to this user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page, scrapes the media base URL from the
    image_src <link> tag and derives the .flv URL from it.
    """

    # group(1) is the numeric video id, group(2) the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this used self._download.trouble; the attribute is
            # self._downloader, so invalid URLs raised AttributeError
            # instead of being reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail <link> tag carries the media base URL; appending
        # /<id>.flv yields the downloadable video URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert; otherwise
    # group 'showname' and 'episode' identify the full-episodes page.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Known bitrates, worst-to-best.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): dictionary bodies missing from this excerpt.
    _video_extensions = {
    _video_dimensions = {

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print formats with their extension and dimensions.

        NOTE(review): the `for x in formats:` loop header is missing
        from this excerpt.
        """
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): `if mobj is None:` guards, `try:` openers,
        # `return`s, `else:` branches, the turls/results accumulators and
        # the `info = { ... }` construction are missing from this
        # excerpt; restore them before relying on this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand the :shortname abbreviations to the real show URL and
        # re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode part means "download the newest episode".
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # Follow the redirect to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Resolve redirects so rtmpdump gets the final SWF player URL.
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # One MRSS <item> per video segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            # Collect (bitrate, url) pairs from the media config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'description': officialTitle,
            'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Scrapes the og: meta tags of the video page, follows the player's
    config= parameter to a JavaScript config blob, and reads the media
    URL out of its playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # NOTE(review): the `if mobj is None:` guard, `try:` openers,
        # `return`s and the `info = { ... }` / `return [info]` scaffolding
        # are missing from this excerpt; restore them before relying on
        # this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honour the charset declared in the Content-Type header,
        # defaulting to utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the page's meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Resolves the public video id to the internal one embedded in the
    page, then reads title/url/thumbnail from the moogaloop XML feed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): guard lines, `try:` openers, `return`s and parts
        # of the `info` dict literal are missing from this excerpt;
        # restore them before relying on this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # The page embeds a second, internal id used by the XML feed.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        'internal_id': internal_video_id,
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Derive the extension from the media URL.
        info['ext'] = info['url'].rpartition('.')[2]
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the flv_url parameter, the page title and the thumbnail URL
    out of the video page.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): `if mobj is None:` guards, `try:` openers,
        # `return`s and the surrounding `info = { ... }` literal are
        # missing from this excerpt; restore them before relying on this
        # method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0).decode('utf-8')

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) is the uploader slug, group(2) the track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): `if mobj is None:` guards, `try:` openers,
        # `return`s, `else:` branches and the `return [{ ... }]` opener
        # are missing from this excerpt; restore them before relying on
        # this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1).decode('utf-8')
        # Fall back to the slug-based title when the page yields none.
        title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description defaults when the page carries none.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # upload date: parse e.g. "November 8, 2011 14:30" into YYYYMMDD.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception as err:
            # Date parsing is best-effort: warn, keep extracting.
            self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        'id': video_id.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': upload_date,
        'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    Decodes the base64 'jsclassref' reference on the page into an
    rtmpe:// media path and scrapes title/description from the page.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): `if mobj is None:` guards, `try:` openers,
        # `return`s and the `info = { ... }` opener are missing from this
        # excerpt; restore them before relying on this method.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # Extract video URL from the base64-encoded jsclassref attribute.
        # NOTE(review): str.decode('base64') is Python 2 only.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title from the page's contentTitle JS variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # id and extension come from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Uses the site's JSON API; picks the best (or requested) audio format
    and the first URL in that format that answers an HTTP probe.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print a human-readable table of available formats/bitrates."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every advertised format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded into its videos), and the site root (expanded into courses).
    """
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a list of info dicts (one per resolved video)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the video.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video pages and recurse through extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect course pages and recurse through extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads song/performer metadata from <meta> tags, then fetches the
    mediaGen XML playlist and picks the highest-quality rendition.
    """
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # _VALID_URL makes the scheme optional; default to plain http.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Song and performer are embedded in <meta> tags (latin-1 encoded).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: used to read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments; each segment gets its own info
    dict, so _real_extract returns one entry per segment.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two randoms)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        # Linear-congruential shuffle driven by the server-provided seed.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return a list of info dicts, one per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the user's format request onto Youku's stream names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    Pulls the FLV URL, title and thumbnail out of the page with the
    class-level regexes below.
    """
    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Page-scraping patterns.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3449 class GooglePlusIE(InfoExtractor):
3450 """Information extractor for plus.google.com."""
3452 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3453 IE_NAME = u'plus.google'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader
        # via set_downloader().
        InfoExtractor.__init__(self, downloader)
3458 def report_extract_entry(self, url):
3459 """Report downloading extry"""
3460 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3462 def report_date(self, upload_date):
3463 """Report downloading extry"""
3464 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3466 def report_uploader(self, uploader):
3467 """Report downloading extry"""
3468 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3470 def report_title(self, video_title):
3471 """Report downloading extry"""
3472 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3474 def report_extract_vid_page(self, video_page):
3475 """Report information extraction."""
3476 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3478 def _real_extract(self, url):
3479 # Extract id from URL
3480 mobj = re.match(self._VALID_URL, url)
3482 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3485 post_url = mobj.group(0)
3486 video_id = mobj.group(2)
3488 video_extension = 'flv'
3490 # Step 1, Retrieve post webpage to extract further information
3491 self.report_extract_entry(post_url)
3492 request = compat_urllib_request.Request(post_url)
3494 webpage = compat_urllib_request.urlopen(request).read()
3495 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3496 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3499 # Extract update date
3501 pattern = 'title="Timestamp">(.*?)</a>'
3502 mobj = re.search(pattern, webpage)
3504 upload_date = mobj.group(1)
3505 # Convert timestring to a format suitable for filename
3506 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3507 upload_date = upload_date.strftime('%Y%m%d')
3508 self.report_date(upload_date)
3512 pattern = r'rel\="author".*?>(.*?)</a>'
3513 mobj = re.search(pattern, webpage)
3515 uploader = mobj.group(1)
3516 self.report_uploader(uploader)
3519 # Get the first line for title
3521 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3522 mobj = re.search(pattern, webpage)
3524 video_title = mobj.group(1)
3525 self.report_title(video_title)
3527 # Step 2, Stimulate clicking the image box to launch video
3528 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3529 mobj = re.search(pattern, webpage)
3531 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3533 video_page = mobj.group(1)
3534 request = compat_urllib_request.Request(video_page)
3536 webpage = compat_urllib_request.urlopen(request).read()
3537 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3538 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3540 self.report_extract_vid_page(video_page)
3543 # Extract video links on video page
3544 """Extract video links of all sizes"""
3545 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3546 mobj = re.findall(pattern, webpage)
3548 self._downloader.trouble(u'ERROR: unable to extract video links')
3550 # Sort in resolution
3551 links = sorted(mobj)
3553 # Choose the lowest of the sort, i.e. highest resolution
3554 video_url = links[-1]
3555 # Only get the url. The resolution part in the tuple has no use anymore
3556 video_url = video_url[-1]
3557 # Treat escaped \u0026 style hex
3558 video_url = unicode(video_url, "unicode_escape")
3562 'id': video_id.decode('utf-8'),
3564 'uploader': uploader.decode('utf-8'),
3565 'upload_date': upload_date.decode('utf-8'),
3566 'title': video_title.decode('utf-8'),
3567 'ext': video_extension.decode('utf-8'),