2 # -*- coding: utf-8 -*-
13 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:    Nickname of the video uploader, unescaped.
    upload_date: Video upload date (YYYYMMDD).
    title:       Video title, unescaped.
    ext:         Video filename extension.

    The following fields are optional:

    format:      The video format, defaults to ext (used for --get-format)
    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.
    player_url:  SWF Player URL (used for rtmpdump).
    subtitles:   The .srt file contents.
    urlhandle:   [internal] The urlHandle to be used to download the file,
                 like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a line appears to be missing from this extraction
        # before the call below (presumably a "ready" flag initialization) --
        # confirm against the original file.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the 'def' headers for the _WORKING getter and the
    # initialize() entry point appear to be missing from this extraction;
    # the two docstrings and the call below are orphaned fragments of those
    # methods -- confirm against the original file.
    """Getter method for _WORKING."""
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a line appears to be missing from this extraction
        # before the call below (presumably an initialization call) -- confirm.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the opening assignment of this verbose regular expression
    # (something like: _VALID_URL = r'''...) appears to be missing from this
    # extraction, leaving the pattern text below orphaned.  A 'v=' alternative
    # also seems to be absent between the params-delimiter line and the first
    # ')?'.  Confirm against the original file.
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow

    # URL that forces English so scraped strings/dates parse predictably.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url query parameter used by redirect pages (e.g. age gate).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.  NOTE(review): most entries and the closing
    # brace appear to be missing from this extraction.
    _video_extensions = {
    '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimensions string.  NOTE(review): the entries and the
    # closing brace appear to be missing from this extraction.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is written as a commented multi-line pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timed-text XML into SubRip (.srt) subtitle text.

        NOTE(review): the initialization of the 'srt' accumulator, a numeric
        conversion of 'start' before the arithmetic below, and the final
        'return srt' appear to be missing from this extraction -- confirm.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when no dur attribute is given
            end = start + float(dur)
            # Render SubRip timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available format code with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' loop header appears to be
        # missing from this extraction; 'x' is otherwise unbound here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in and confirm age before extraction (best effort).

        NOTE(review): the extracted text of this method has gaps: missing
        'try:' headers, an early 'return', and the 'login_form'/'age_form'
        dict-literal openings.  The hedged comments below mark them.
        """
        if self._downloader is None:
            # NOTE(review): an early 'return' appears to be missing here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing 'try:' and the lines consuming
            # 'info' appear to be missing from this extraction.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language -- failure only warns, extraction continues.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the enclosing 'try:' appears to be missing here.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the 'login_form = {' opening of this dict literal
        # appears to be missing from this extraction.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form appearing in the response means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age.  NOTE(review): the 'age_form = {' opening of this dict
        # literal appears to be missing from this extraction.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the information dictionaries for a YouTube URL.

        NOTE(review): the extracted text of this method has gaps (missing
        'try:' headers, 'if mobj is None/not None:' guards, 'return's and
        'else:' branches); the hedged comments below mark the obvious ones.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): presumably guarded by 'if mobj is not None:' -- missing here.
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 to skip the age gate when possible)
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # NOTE(review): the enclosing 'try:' appears to be missing here.
        video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the backslash-escaped URL found in the page's JS config.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): the enclosing 'try:' and a 'break' after a
            # successful token lookup appear to be missing here.
            video_info_webpage = compat_urllib_request.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # NOTE(review): presumably wrapped in try/except ValueError with a
            # 'break' on success -- those lines are missing here.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # NOTE(review): the enclosing 'try:' that pairs with the
            # 'except Trouble' at the end of this section appears to be
            # missing from this extraction.
            self.report_video_subtitles_download(video_id)
            request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Language choice: explicit option, then English, then first available.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # NOTE(review): lines appear missing here; presumably this
                # branch chose 'en' and an 'else:' fell back to the first
                # available language -- confirm.
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
            # NOTE(review): presumably guarded by 'if not srt_xml:' -- missing here.
            raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            self._downloader.trouble(trouble[0])

        # duration
        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> playable URL (the signature must be re-appended).
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): an 'else:' appears to be missing here.
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): an 'else:' appears to be missing before this branch.
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): an 'if rf in url_map:' guard and a 'break'
                # appear to be missing here.
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): an 'else:' appears to be missing before this error path.
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dictionary per selected (format, url) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the opening of the result-dict construction (e.g.
            # 'results.append({') appears to be missing from this extraction,
            # as does the closing and the final 'return'.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': video_format,
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age to lift the family filter.

        NOTE(review): the extracted text of this method has gaps: the 'try:'
        headers and the 'disclaimer_form = {' dict opening are missing.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age -- POST the filter form.
        # NOTE(review): the 'disclaimer_form = {' opening appears to be missing.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the information dictionary for a metacafe.com URL.

        NOTE(review): the extracted text of this method has gaps ('if mobj
        is None:' guards, 'try:' headers, 'return's, 'else:' branches and the
        result-dict opening are missing); hedged comments mark the obvious ones.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate 'yt-' prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback path: pull the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the opening of the result-dict construction and the
        # final 'return' appear to be missing from this extraction.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the information dictionary for a Dailymotion URL.

        NOTE(review): the extracted text of this method has gaps ('if mobj
        is None:' guards, 'try:' headers, 'return's, 'else:' branches and
        parts of the quality-selection loop are missing).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # The id is the leading token before any '_title' suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')  # bypass the family filter
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the first present key wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the 'if key in flashvars:' guard, the assignment of
            # 'max_quality' and a 'break' appear to be missing here.
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        # NOTE(review): the branch structure around the two assignments below
        # ('if mobj is None:' / 'else:') appears to be missing here.
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the opening of the result-dict construction and the
        # final 'return' appear to be missing from this extraction.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the information dictionary for a Google Video URL.

        NOTE(review): the extracted text of this method has gaps ('if mobj
        is None:' guards, 'try:' headers, 'return's and the mp4/flv branch
        structure are missing); hedged comments mark the obvious ones.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # NOTE(review): the branch that used the mp4 download_url appears to be
        # missing; the lines below are the flv fallback path.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Un-escape the \xNN sequences embedded in the page's JS.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # NOTE(review): the enclosing 'try:' appears to be missing here.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # NOTE(review): the opening of the result-dict construction and the
        # final 'return' appear to be missing from this extraction.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the information dictionary for a photobucket.com URL.

        NOTE(review): the extracted text of this method has gaps ('if mobj
        is None:' guards, 'try:' headers, 'return's and the result-dict
        opening are missing).
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the opening of the result-dict construction and the
        # final 'return' appear to be missing from this extraction.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
893 class YahooIE(InfoExtractor):
894 """Information extractor for video.yahoo.com."""
896 # _VALID_URL matches all Yahoo! Video URLs
897 # _VPAGE_URL matches only the extractable '/watch/' URLs
898 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
899 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
900 IE_NAME = u'video.yahoo'
902 def __init__(self, downloader=None):
903 InfoExtractor.__init__(self, downloader)
905 def report_download_webpage(self, video_id):
906 """Report webpage download."""
907 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
909 def report_extraction(self, video_id):
910 """Report information extraction."""
911 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
913 def _real_extract(self, url, new_video=True):
914 # Extract ID from URL
915 mobj = re.match(self._VALID_URL, url)
917 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
920 video_id = mobj.group(2)
921 video_extension = 'flv'
923 # Rewrite valid but non-extractable URLs as
924 # extractable English language /watch/ URLs
925 if re.match(self._VPAGE_URL, url) is None:
926 request = compat_urllib_request.Request(url)
928 webpage = compat_urllib_request.urlopen(request).read()
929 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
930 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
933 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
935 self._downloader.trouble(u'ERROR: Unable to extract id field')
937 yahoo_id = mobj.group(1)
939 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
941 self._downloader.trouble(u'ERROR: Unable to extract vid field')
943 yahoo_vid = mobj.group(1)
945 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
946 return self._real_extract(url, new_video=False)
948 # Retrieve video webpage to extract further information
949 request = compat_urllib_request.Request(url)
951 self.report_download_webpage(video_id)
952 webpage = compat_urllib_request.urlopen(request).read()
953 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
954 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
957 # Extract uploader and title from webpage
958 self.report_extraction(video_id)
959 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
961 self._downloader.trouble(u'ERROR: unable to extract video title')
963 video_title = mobj.group(1).decode('utf-8')
965 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
967 self._downloader.trouble(u'ERROR: unable to extract video uploader')
969 video_uploader = mobj.group(1).decode('utf-8')
971 # Extract video thumbnail
972 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
974 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
976 video_thumbnail = mobj.group(1).decode('utf-8')
978 # Extract video description
979 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
981 self._downloader.trouble(u'ERROR: unable to extract video description')
983 video_description = mobj.group(1).decode('utf-8')
984 if not video_description:
985 video_description = 'No description available.'
987 # Extract video height and width
988 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
990 self._downloader.trouble(u'ERROR: unable to extract video height')
992 yv_video_height = mobj.group(1)
994 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
996 self._downloader.trouble(u'ERROR: unable to extract video width')
998 yv_video_width = mobj.group(1)
1000 # Retrieve video playlist to extract media URL
1001 # I'm not completely sure what all these options are, but we
1002 # seem to need most of them, otherwise the server sends a 401.
1003 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1004 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1005 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1006 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1007 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1009 self.report_download_webpage(video_id)
1010 webpage = compat_urllib_request.urlopen(request).read()
1011 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1015 # Extract media URL from playlist XML
1016 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1018 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1020 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1021 video_url = unescapeHTML(video_url)
1024 'id': video_id.decode('utf-8'),
1026 'uploader': video_uploader,
1027 'upload_date': None,
1028 'title': video_title,
1029 'ext': video_extension.decode('utf-8'),
1030 'thumbnail': video_thumbnail.decode('utf-8'),
1031 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for a Vimeo URL,
        or None on error (after reporting via self._downloader.trouble)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page; the splits raise
        # IndexError and json.loads raises ValueError when the page layout
        # changed, so catch exactly those (plus KeyError from lookups below).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available (quality first, then codec preference);
        # the for/else fires only when no codec matched at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html and use a different extraction path.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body (None on error)."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect the groups listed in
        *matchTuples* (each a (group_index, key, error_message) triple) into
        a dict. Returns None after reporting when the page or a group is
        missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live page.

        NOTE(review): this historically computes video_url but does not
        return it, so live extraction yields nothing downstream — kept as-is.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an Arte+7 page and return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title'),
            # assumes the hd <url> entry is an mp4 stream — TODO confirm
            'ext':         u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True when *url* redirected elsewhere (the new URL is handed
        back to the downloader), False when it resolves to itself.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Per RFC, entity headers don't apply to the new request.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our opener from scratch so only these handlers run.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape the page for
        a direct media URL. Returns a one-element info-dict list or None."""
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id':          video_id.decode('utf-8'),
            'url':         video_url.decode('utf-8'),
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension.decode('utf-8'),
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch[N|all]: prefix and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # limit is refined from the API's totalItems once the first page lands
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch[N|all]: prefix and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch[N|all]: prefix and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        # Yahoo repeats results across pages; dedupe with a set.
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): list-type key (p/a/list), group(2): playlist id,
    # group(3): an embedded single-video id, when present.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the embedded video id straight back.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply the user's --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next'  # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through the user's uploads feed and queue every video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Results per Ajax page; used below to detect the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, page through the episode list and
        queue every video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rebuilds the URL in the English locale, requests the page with the
    'Free download' gateway flag set, and scrapes the real file URL and
    title out of the resulting HTML.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex (was '\s+'); non-raw escape
                # sequences in regex literals are fragile and inconsistent
                # with the rest of this file.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
# NOTE(review): in this chunk the embedded original line numbers are not
# contiguous — several statements ("try:" headers, "if mobj is None:" guards,
# "return"s, dict literals) have been elided from view. Comments below
# document only what the visible lines demonstrate; restyling is unsafe here.
2035 class FacebookIE(InfoExtractor):
2036 """Information Extractor for Facebook"""
# Matches facebook video/photo permalinks and captures the numeric id as 'ID'.
2039 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2040 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2041 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for --format selection below.
2042 _available_formats = ['video', 'highqual', 'lowqual']
2043 _video_extensions = {
2048 IE_NAME = u'facebook'
2050 def __init__(self, downloader=None):
2051 InfoExtractor.__init__(self, downloader)
2053 def _reporter(self, message):
2054 """Add header and report message."""
2055 self._downloader.to_screen(u'[facebook] %s' % message)
2057 def report_login(self):
2058 """Report attempt to log in."""
2059 self._reporter(u'Logging in')
2061 def report_video_webpage_download(self, video_id):
2062 """Report attempt to download video webpage."""
2063 self._reporter(u'%s: Downloading video webpage' % video_id)
2065 def report_information_extraction(self, video_id):
2066 """Report attempt to extract video information."""
2067 self._reporter(u'%s: Extracting video information' % video_id)
2069 def _parse_page(self, video_webpage):
2070 """Extract video information from page"""
# Map of info-dict key -> regex that locates it in the page's inline JS.
2072 data = {'title': r'\("video_title", "(.*?)"\)',
2073 'description': r'<div class="datawrap">(.*?)</div>',
2074 'owner': r'\("video_owner_name", "(.*?)"\)',
2075 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2078 for piece in data.keys():
2079 mobj = re.search(data[piece], video_webpage)
2080 if mobj is not None:
# Values are \uXXXX-escaped inside the JS; decode then URL-unquote.
2081 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2085 for fmt in self._available_formats:
2086 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2087 if mobj is not None:
2088 # URL is in a Javascript segment inside an escaped Unicode format within
2089 # the generally utf-8 page
2090 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2091 video_info['video_urls'] = video_urls
2095 def _real_initialize(self):
# Login is optional: silently skip when no downloader/credentials exist.
2096 if self._downloader is None:
2101 downloader_params = self._downloader.params
2103 # Attempt to use provided username and password or .netrc data
2104 if downloader_params.get('username', None) is not None:
2105 useremail = downloader_params['username']
2106 password = downloader_params['password']
2107 elif downloader_params.get('usenetrc', False):
2109 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2110 if info is not None:
2114 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2115 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are only warnings; extraction can proceed anonymously.
2116 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2119 if useremail is None:
2128 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2131 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2132 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2133 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2135 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2136 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2139 def _real_extract(self, url):
2140 mobj = re.match(self._VALID_URL, url)
2142 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2144 video_id = mobj.group('ID')
2147 self.report_video_webpage_download(video_id)
2148 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2150 page = compat_urllib_request.urlopen(request)
2151 video_webpage = page.read()
2152 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2153 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2156 # Start extracting information
2157 self.report_information_extraction(video_id)
2159 # Extract information
2160 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; their absence aborts extraction.
2163 if 'owner' not in video_info:
2164 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2166 video_uploader = video_info['owner']
2169 if 'title' not in video_info:
2170 self._downloader.trouble(u'ERROR: unable to extract video title')
2172 video_title = video_info['title']
2173 video_title = video_title.decode('utf-8')
# thumbnail is optional: warn and fall back to the empty string.
2176 if 'thumbnail' not in video_info:
2177 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2178 video_thumbnail = ''
2180 video_thumbnail = video_info['thumbnail']
# upload date arrives as an RFC-2822 style string; convert to YYYYMMDD.
2184 if 'upload_date' in video_info:
2185 upload_time = video_info['upload_date']
2186 timetuple = email.utils.parsedate_tz(upload_time)
2187 if timetuple is not None:
2189 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2194 video_description = video_info.get('description', 'No description available.')
2196 url_map = video_info['video_urls']
2197 if len(url_map.keys()) > 0:
2198 # Decide which formats to download
2199 req_format = self._downloader.params.get('format', None)
2200 format_limit = self._downloader.params.get('format_limit', None)
# --format-limit caps quality: keep only formats at or below the limit.
2202 if format_limit is not None and format_limit in self._available_formats:
2203 format_list = self._available_formats[self._available_formats.index(format_limit):]
2205 format_list = self._available_formats
2206 existing_formats = [x for x in format_list if x in url_map]
2207 if len(existing_formats) == 0:
2208 self._downloader.trouble(u'ERROR: no known formats available for video')
2210 if req_format is None:
2211 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2212 elif req_format == 'worst':
2213 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2214 elif req_format == '-1':
2215 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2218 if req_format not in url_map:
2219 self._downloader.trouble(u'ERROR: requested format not available')
2221 video_url_list = [(req_format, url_map[req_format])] # Specific format
# One info dict per selected format; 'results' assembly appears elided here.
2224 for format_param, video_real_url in video_url_list:
2226 video_extension = self._video_extensions.get(format_param, 'mp4')
2229 'id': video_id.decode('utf-8'),
2230 'url': video_real_url.decode('utf-8'),
2231 'uploader': video_uploader.decode('utf-8'),
2232 'upload_date': upload_date,
2233 'title': video_title,
2234 'ext': video_extension.decode('utf-8'),
# 'A and X or Y' idiom: yields u'NA' when format_param is falsy.
2235 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2236 'thumbnail': video_thumbnail.decode('utf-8'),
2237 'description': video_description.decode('utf-8'),
# NOTE(review): embedded original line numbers skip in this chunk; "try:"
# headers, guards and parts of the direct-download info dict are elided.
# Comments document only what the visible lines show.
2241 class BlipTVIE(InfoExtractor):
2242 """Information extractor for blip.tv"""
2244 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Extracts the trailing filename extension from a media URL.
2245 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2246 IE_NAME = u'blip.tv'
2248 def report_extraction(self, file_id):
2249 """Report information extraction."""
2250 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2252 def report_direct_download(self, title):
# NOTE(review): docstring is copy-pasted from report_extraction; it actually
# reports a direct-download detection.
2253 """Report information extraction."""
2254 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2256 def _real_extract(self, url):
2257 mobj = re.match(self._VALID_URL, url)
2259 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for the JSON metadata variant of the page ('cchar' — the
# '?'/'&' joiner — is assigned on an elided line).
2266 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2267 request = compat_urllib_request.Request(json_url.encode('utf-8'))
2268 self.report_extraction(mobj.group(1))
2271 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself instead of JSON, synthesize
# the info dict straight from the URL's basename.
2272 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2273 basename = url.split('/')[-1]
2274 title,ext = os.path.splitext(basename)
2275 title = title.decode('UTF-8')
2276 ext = ext.replace('.', '')
2277 self.report_direct_download(title)
2282 'upload_date': None,
2287 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2288 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2290 if info is None: # Regular URL
2292 json_code = urlh.read()
2293 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2294 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2298 json_data = json.loads(json_code)
# The metadata may be wrapped in a 'Post' envelope depending on endpoint.
2299 if 'Post' in json_data:
2300 data = json_data['Post']
# blip.tv datestamps look like '08-25-12 11:23AM'; normalize to YYYYMMDD.
2304 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2305 video_url = data['media']['url']
2306 umobj = re.match(self._URL_EXT, video_url)
2308 raise ValueError('Can not determine filename extension')
2309 ext = umobj.group(1)
2312 'id': data['item_id'],
2314 'uploader': data['display_name'],
2315 'upload_date': upload_date,
2316 'title': data['title'],
2318 'format': data['media']['mimeType'],
2319 'thumbnail': data['thumbnailUrl'],
2320 'description': data['description'],
2321 'player_url': data['embedUrl']
# Missing JSON keys and bad datestamps both surface here as parse errors.
2323 except (ValueError,KeyError) as err:
2324 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): mutates the module-level std_headers for every later request,
# not just blip.tv ones — confirm this global side effect is intended.
2327 std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page, derives the FLV media URL from the
    thumbnail's image_src link, and scrapes the <title> for the video title.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was self._download.trouble — the attribute is
            # self._downloader; the typo raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The FLV lives next to the thumbnail path referenced by image_src.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
# NOTE(review): embedded original line numbers skip throughout this class;
# "try:" headers, guards, "return"s, the _video_extensions/_video_dimensions
# bodies, and the turls/results initializations are elided from view.
# Comments document only what the visible lines show.
2389 class ComedyCentralIE(InfoExtractor):
2390 """Information extractor for The Daily Show and Colbert Report """
# Accepts shortnames (':tds', ':colbert', ...) or full-episodes URLs.
2392 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2393 IE_NAME = u'comedycentral'
# Bitrates ordered worst-to-best; turls below follows this order.
2395 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2397 _video_extensions = {
2405 _video_dimensions = {
2414 def report_extraction(self, episode_id):
2415 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2417 def report_config_download(self, episode_id):
2418 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2420 def report_index_download(self, episode_id):
2421 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2423 def report_player_url(self, episode_id):
2424 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2427 def _print_formats(self, formats):
2428 print('Available formats:')
2430 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2433 def _real_extract(self, url):
2434 mobj = re.match(self._VALID_URL, url)
2436 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames are rewritten to the show's full-episodes landing page and
# re-matched so the named groups are populated.
2439 if mobj.group('shortname'):
2440 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2441 url = u'http://www.thedailyshow.com/full-episodes/'
2443 url = u'http://www.colbertnation.com/full-episodes/'
2444 mobj = re.match(self._VALID_URL, url)
2445 assert mobj is not None
# No specific episode given -> follow redirects to the newest episode.
2447 dlNewest = not mobj.group('episode')
2449 epTitle = mobj.group('showname')
2451 epTitle = mobj.group('episode')
2453 req = compat_urllib_request.Request(url)
2454 self.report_extraction(epTitle)
2456 htmlHandle = compat_urllib_request.urlopen(req)
2457 html = htmlHandle.read()
2458 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2459 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# geturl() reflects the redirect target, i.e. the concrete episode URL.
2462 url = htmlHandle.geturl()
2463 mobj = re.match(self._VALID_URL, url)
2465 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2467 if mobj.group('episode') == '':
2468 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2470 epTitle = mobj.group('episode')
# NOTE(review): dots in 'media.mtvnservices.com' are unescaped in this
# regex — they match any char; harmless in practice but worth tightening.
2472 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2474 if len(mMovieParams) == 0:
2475 # The Colbert Report embeds the information in a without
2476 # a URL prefix; so extract the alternate reference
2477 # and then add the URL prefix manually.
2479 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2480 if len(altMovieParams) == 0:
2481 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2484 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2486 playerUrl_raw = mMovieParams[0][0]
2487 self.report_player_url(epTitle)
# Resolve the player URL through its redirect chain (needed for rtmpdump).
2489 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2490 playerUrl = urlHandle.geturl()
2491 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2492 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2495 uri = mMovieParams[0][1]
2496 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2497 self.report_index_download(epTitle)
2499 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2500 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2501 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# The MRSS index lists one <item> per video segment of the episode.
2506 idoc = xml.etree.ElementTree.fromstring(indexXml)
2507 itemEls = idoc.findall('.//item')
2508 for itemEl in itemEls:
2509 mediaId = itemEl.findall('./guid')[0].text
2510 shortMediaId = mediaId.split(':')[-1]
2511 showId = mediaId.split(':')[-2].replace('.com', '')
2512 officialTitle = itemEl.findall('./title')[0].text
2513 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML carries the per-bitrate rendition URLs.
2515 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2516 compat_urllib_parse.urlencode({'uri': mediaId}))
2517 configReq = compat_urllib_request.Request(configUrl)
2518 self.report_config_download(epTitle)
2520 configXml = compat_urllib_request.urlopen(configReq).read()
2521 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2522 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2525 cdoc = xml.etree.ElementTree.fromstring(configXml)
2527 for rendition in cdoc.findall('.//rendition'):
2528 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2532 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2535 if self._downloader.params.get('listformats', None):
2536 self._print_formats([i[0] for i in turls])
2539 # For now, just pick the highest bitrate
2540 format,video_url = turls[-1]
2542 # Get the format arg from the arg stream
2543 req_format = self._downloader.params.get('format', None)
2545 # Select format if we can find one
2548 format, video_url = f, v
2551 # Patch to download from alternative CDN, which does not
2552 # break on current RTMPDump builds
2553 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2554 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2556 if video_url.startswith(broken_cdn):
2557 video_url = video_url.replace(broken_cdn, better_cdn)
2559 effTitle = showId + u'-' + epTitle
2564 'upload_date': officialDate,
2569 'description': officialTitle,
2570 'player_url': None #playerUrl
2573 results.append(info)
# NOTE(review): embedded original line numbers skip here; "try:" headers,
# "return"s after trouble() calls and part of the final info dict are elided.
2578 class EscapistIE(InfoExtractor):
2579 """Information extractor for The Escapist """
2581 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2582 IE_NAME = u'escapist'
2584 def report_extraction(self, showName):
2585 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2587 def report_config_download(self, showName):
2588 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2590 def _real_extract(self, url):
2591 mobj = re.match(self._VALID_URL, url)
2593 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2595 showName = mobj.group('showname')
2596 videoId = mobj.group('episode')
2598 self.report_extraction(showName)
2600 webPage = compat_urllib_request.urlopen(url)
2601 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, utf-8 as fallback.
2602 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2603 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2604 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2605 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Metadata comes from OpenGraph/meta tags; the player config URL is the
# 'config=' query parameter of the og:video player URL.
2608 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2609 description = unescapeHTML(descMatch.group(1))
2610 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2611 imgUrl = unescapeHTML(imgMatch.group(1))
2612 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2613 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2614 configUrlMatch = re.search('config=(.*)$', playerUrl)
2615 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2617 self.report_config_download(showName)
2619 configJSON = compat_urllib_request.urlopen(configUrl).read()
2620 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2621 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2624 # Technically, it's JavaScript, not JSON
# NOTE(review): naive quote swap corrupts values containing apostrophes.
2625 configJSON = configJSON.replace("'", '"')
2628 config = json.loads(configJSON)
2629 except (ValueError,) as err:
2630 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The actual media URL is the second playlist entry.
2633 playlist = config['playlist']
2634 videoUrl = playlist[1]['url']
2639 'uploader': showName,
2640 'upload_date': None,
2643 'thumbnail': imgUrl,
2644 'description': description,
2645 'player_url': playerUrl,
# NOTE(review): embedded original line numbers skip here; "try:" headers,
# guards/"return"s and parts of the info dict initialization are elided.
2651 class CollegeHumorIE(InfoExtractor):
2652 """Information extractor for collegehumor.com"""
2654 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2655 IE_NAME = u'collegehumor'
2657 def report_webpage(self, video_id):
2658 """Report information extraction."""
2659 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2661 def report_extraction(self, video_id):
2662 """Report information extraction."""
2663 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2665 def _real_extract(self, url):
2666 mobj = re.match(self._VALID_URL, url)
2668 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2670 video_id = mobj.group('videoid')
2672 self.report_webpage(video_id)
2673 request = compat_urllib_request.Request(url)
2675 webpage = compat_urllib_request.urlopen(request).read()
2676 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2677 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page's public id differs from the internal one used by the XML API.
2680 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2682 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2684 internal_video_id = m.group('internalvideoid')
2688 'internal_id': internal_video_id,
2690 'upload_date': None,
2693 self.report_extraction(video_id)
# The moogaloop endpoint returns XML metadata keyed by the internal id.
2694 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2696 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2697 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2698 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2701 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on missing nodes; reported below as
# invalid metadata (the enclosing try/except header is elided here).
2703 videoNode = mdoc.findall('./video')[0]
2704 info['description'] = videoNode.findall('./description')[0].text
2705 info['title'] = videoNode.findall('./caption')[0].text
2706 info['url'] = videoNode.findall('./file')[0].text
2707 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the final '.' of the media URL.
2708 info['ext'] = info['url'].rpartition('.')[2]
2710 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): embedded original line numbers skip here; "try:" headers,
# guards/"return"s and parts of the final info dict are elided from view.
2716 class XVideosIE(InfoExtractor):
2717 """Information extractor for xvideos.com"""
2719 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2720 IE_NAME = u'xvideos'
2722 def report_webpage(self, video_id):
2723 """Report information extraction."""
2724 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2726 def report_extraction(self, video_id):
2727 """Report information extraction."""
2728 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2730 def _real_extract(self, url):
2731 mobj = re.match(self._VALID_URL, url)
2733 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Python-2 idiom: bytes -> unicode for the captured id.
2735 video_id = mobj.group(1).decode('utf-8')
2737 self.report_webpage(video_id)
2739 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2741 webpage = compat_urllib_request.urlopen(request).read()
2742 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2743 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2746 self.report_extraction(video_id)
# The FLV URL is URL-encoded inside the page's 'flv_url=' parameter.
2750 mobj = re.search(r'flv_url=(.+?)&', webpage)
2752 self._downloader.trouble(u'ERROR: unable to extract video url')
2754 video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
2758 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2760 self._downloader.trouble(u'ERROR: unable to extract video title')
2762 video_title = mobj.group(1).decode('utf-8')
2765 # Extract video thumbnail
2766 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2768 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured name.
2770 video_thumbnail = mobj.group(0).decode('utf-8')
2776 'upload_date': None,
2777 'title': video_title,
2779 'thumbnail': video_thumbnail,
2780 'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search(r'"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj:
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search(r'"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date
        upload_date = None
        mobj = re.search(r"pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            # FIX: 'except Exception, e:' is Python-2-only syntax and
            # inconsistent with the 'as err' form used everywhere else in
            # this file; a bad date is non-fatal, so only log it.
            except Exception as e:
                self._downloader.to_stderr(compat_str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date':  upload_date,
            'title':    title,
            'ext':      u'mp3',
            'description': description.decode('utf-8')
        }]
# NOTE(review): embedded original line numbers skip here; "try:" headers,
# guards/"return"s and parts of the final info dict are elided from view.
2878 class InfoQIE(InfoExtractor):
2879 """Information extractor for infoq.com"""
2881 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2884 def report_webpage(self, video_id):
2885 """Report information extraction."""
2886 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2888 def report_extraction(self, video_id):
2889 """Report information extraction."""
2890 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2892 def _real_extract(self, url):
2893 mobj = re.match(self._VALID_URL, url)
2895 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2898 self.report_webpage(url)
2900 request = compat_urllib_request.Request(url)
2902 webpage = compat_urllib_request.urlopen(request).read()
2903 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
2904 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2907 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
# NOTE(review): .decode('base64') is a Python-2-only codec — confirm this
# file still targets Python 2 before porting.
2911 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2913 self._downloader.trouble(u'ERROR: unable to extract video url')
2915 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2919 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2921 self._downloader.trouble(u'ERROR: unable to extract video title')
2923 video_title = mobj.group(1).decode('utf-8')
2925 # Extract description
2926 video_description = u'No description available.'
2927 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2928 if mobj is not None:
2929 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the media filename at the end of the URL.
2931 video_filename = video_url.split('/')[-1]
2932 video_id, extension = video_filename.split('.')
2938 'upload_date': None,
2939 'title': video_title,
2940 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2942 'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    # NOTE(review): this excerpt is elided — try:/return/dict-open lines are
    # missing throughout; confirm structure against the full file.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # jsonData[fmt] is either {bitrate: [urls]} or a bare [urls] list;
        # the TypeError branch below handles the latter shape.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate URL; the first one that opens wins.
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # Dump every (format, bitrate, extension) triple for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')
        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
        # parse JSON: player URL plus the nested audio_formats mapping
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])
        req_format = self._downloader.params.get('format', None)
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
        # 'best' (or unspecified): take the first format with a live URL
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""
    # NOTE(review): this excerpt is elided — try:/return/dict literals are
    # partially missing; confirm structure against the full file.

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Three URL shapes: a single video, a course page, or the root index.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': course + '_' + video,
            'upload_date': None,
            self.report_extraction(info['id'])
            # Metadata XML lives next to the course's video folder
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,
            self.report_download_webpage(info['id'])
            coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))
            # Each linked VideoPage becomes a 'reference' entry that is
            # re-dispatched through self.extract() below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            'id': 'Stanford OpenClassroom',
            'upload_date': None,
            self.report_download_webpage(info['id'])
            # Root index: collect every CoursePage link and recurse
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
            info['title'] = info['id']
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""
    # NOTE(review): this excerpt is elided — if-guards/try:/return lines are
    # missing from view; confirm structure against the full file.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # _VALID_URL makes the scheme optional; normalize to http://
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)
        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        # Song metadata is carried in <meta name="mtv_*"> tags
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name
        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message below looks truncated — likely meant
        # "unable to extract mtvn_uri".
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)
        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)
        # mediaGen endpoint returns an XML playlist of renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # For now, always pick the highest quality.
        rendition = renditions[-1]
        # type attribute is a MIME type like "video/mp4"; keep the subtype
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""
    # NOTE(review): this excerpt is elided — some def lines (e.g. _gen_sid),
    # try:/return statements and dict literals are missing from view;
    # confirm structure against the full file.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        # (review) the `def _gen_sid(self):` line is elided here —
        # the following builds a pseudo-random session id: ms timestamp
        # plus two random components, concatenated as digits.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffles the alphabet below using the given
        # seed (linear-congruential step), producing Youku's mix string.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated index list into characters of the
        # seed-shuffled mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')
        # getPlayList returns a JSON config with titles, seeds and segments
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extraction(video_id)
        config = json.loads(jsondata)
        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']
        format = self._downloader.params.get('format', None)
        supported_format = config['data'][0]['streamfileids'].keys()
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':
        fileid = config['data'][0]['streamfileids'][format]
        seg_number = len(config['data'][0]['segs'][format])
        for i in xrange(seg_number):
            keys.append(config['data'][0]['segs'][format][i]['k'])
        #youku only could be viewed from mainland china
        self._downloader.trouble(u'ERROR: unable to extract info section')
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)
        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,
            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    # NOTE(review): this excerpt is elided — if-guards/try:/return and the
    # returned dict's opening line are missing from view.

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # Precompiled-style patterns for the flash player parameters and title
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')
        self.report_webpage(video_id)
        # Get webpage content
        webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
        # flv_url is percent-encoded in the page source
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))
        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1).decode('utf-8')
        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1).decode('utf-8')
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
3453 class GooglePlusIE(InfoExtractor):
3454 """Information extractor for plus.google.com."""
3456 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3457 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor, handing the optional downloader to the base class."""
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Log which Google+ post entry is being downloaded."""
    readable_url = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % readable_url)
def report_date(self, upload_date):
    """Log the entry's upload date."""
    line = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(line)
def report_uploader(self, uploader):
    """Log the uploader of the entry."""
    name = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % name)
def report_title(self, video_title):
    """Log the title of the entry."""
    title_text = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title_text)
def report_extract_vid_page(self, video_page):
    """Log the video page whose download links are being extracted."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3482 def _real_extract(self, url):
3483 # Extract id from URL
3484 mobj = re.match(self._VALID_URL, url)
3486 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3489 post_url = mobj.group(0)
3490 video_id = mobj.group(2)
3492 video_extension = 'flv'
3494 # Step 1, Retrieve post webpage to extract further information
3495 self.report_extract_entry(post_url)
3496 request = compat_urllib_request.Request(post_url)
3498 webpage = compat_urllib_request.urlopen(request).read()
3499 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3500 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3503 # Extract update date
3505 pattern = 'title="Timestamp">(.*?)</a>'
3506 mobj = re.search(pattern, webpage)
3508 upload_date = mobj.group(1)
3509 # Convert timestring to a format suitable for filename
3510 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3511 upload_date = upload_date.strftime('%Y%m%d')
3512 self.report_date(upload_date)
3516 pattern = r'rel\="author".*?>(.*?)</a>'
3517 mobj = re.search(pattern, webpage)
3519 uploader = mobj.group(1)
3520 self.report_uploader(uploader)
3523 # Get the first line for title
3525 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3526 mobj = re.search(pattern, webpage)
3528 video_title = mobj.group(1)
3529 self.report_title(video_title)
3531 # Step 2, Stimulate clicking the image box to launch video
3532 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3533 mobj = re.search(pattern, webpage)
3535 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3537 video_page = mobj.group(1)
3538 request = compat_urllib_request.Request(video_page)
3540 webpage = compat_urllib_request.urlopen(request).read()
3541 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3542 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3544 self.report_extract_vid_page(video_page)
3547 # Extract video links on video page
3548 """Extract video links of all sizes"""
3549 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3550 mobj = re.findall(pattern, webpage)
3552 self._downloader.trouble(u'ERROR: unable to extract video links')
3554 # Sort in resolution
3555 links = sorted(mobj)
3557 # Choose the lowest of the sort, i.e. highest resolution
3558 video_url = links[-1]
3559 # Only get the url. The resolution part in the tuple has no use anymore
3560 video_url = video_url[-1]
3561 # Treat escaped \u0026 style hex
3562 video_url = unicode(video_url, "unicode_escape")
3566 'id': video_id.decode('utf-8'),
3568 'uploader': uploader.decode('utf-8'),
3569 'upload_date': upload_date.decode('utf-8'),
3570 'title': video_title.decode('utf-8'),
3571 'ext': video_extension.decode('utf-8'),