2 # -*- coding: utf-8 -*-
12 import xml.etree.ElementTree
15 from urlparse import parse_qs
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready latches so _real_initialize runs at most once.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: only the first call performs the real initialization.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: subclass overrides suitable() to compile with re.VERBOSE.
    # Group 1 wraps everything before the ID; the video ID is group 2.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.
    # NOTE(review): interior entries were elided in this copy; reconstructed — verify against upstream.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions string.
    # NOTE(review): entries were elided in this copy; reconstructed — verify against upstream.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE.

    Overrides the base implementation because _VALID_URL is written in
    verbose (commented) regex syntax and must be compiled with re.VERBOSE.
    """
    return re.match(self._VALID_URL, url, re.VERBOSE) is not None

def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    # Docstring fixed: it was copy-pasted from report_video_info_webpage_download.
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
205 def _closed_captions_xml_to_srt(self, xml_string):
207 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
208 # TODO parse xml instead of regex
209 for n, (start, dur_tag, dur, caption) in enumerate(texts):
210 if not dur: dur = '4'
212 end = start + float(dur)
213 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
214 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
215 caption = unescapeHTML(caption)
216 caption = unescapeHTML(caption) # double cycle, intentional
217 srt += str(n+1) + '\n'
218 srt += start + ' --> ' + end + '\n'
219 srt += caption + '\n\n'
222 def _print_formats(self, formats):
223 print('Available formats:')
225 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
227 def _real_initialize(self):
228 if self._downloader is None:
233 downloader_params = self._downloader.params
235 # Attempt to use provided username and password or .netrc data
236 if downloader_params.get('username', None) is not None:
237 username = downloader_params['username']
238 password = downloader_params['password']
239 elif downloader_params.get('usenetrc', False):
241 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
246 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
247 except (IOError, netrc.NetrcParseError) as err:
248 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
252 request = compat_urllib_request.Request(self._LANG_URL)
255 compat_urllib_request.urlopen(request).read()
256 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
257 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
260 # No authentication to be performed
266 'current_form': 'loginForm',
268 'action_login': 'Log In',
269 'username': username,
270 'password': password,
272 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
275 login_results = compat_urllib_request.urlopen(request).read()
276 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
277 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
279 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
280 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
286 'action_confirm': 'Confirm',
288 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
290 self.report_age_confirmation()
291 age_results = compat_urllib_request.urlopen(request).read()
292 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
293 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
296 def _real_extract(self, url):
297 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
298 mobj = re.search(self._NEXT_URL_RE, url)
300 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
302 # Extract video id from URL
303 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
305 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
307 video_id = mobj.group(2)
310 self.report_video_webpage_download(video_id)
311 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
313 video_webpage = compat_urllib_request.urlopen(request).read()
314 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
315 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
318 # Attempt to extract SWF player URL
319 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
321 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
326 self.report_video_info_webpage_download(video_id)
327 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
328 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
329 % (video_id, el_type))
330 request = compat_urllib_request.Request(video_info_url)
332 video_info_webpage = compat_urllib_request.urlopen(request).read()
333 video_info = parse_qs(video_info_webpage)
334 if 'token' in video_info:
336 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
337 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
339 if 'token' not in video_info:
340 if 'reason' in video_info:
341 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
343 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
346 # Check for "rental" videos
347 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
348 self._downloader.trouble(u'ERROR: "rental" videos not supported')
351 # Start extracting information
352 self.report_information_extraction(video_id)
355 if 'author' not in video_info:
356 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
358 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
361 if 'title' not in video_info:
362 self._downloader.trouble(u'ERROR: unable to extract video title')
364 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
365 video_title = video_title.decode('utf-8')
368 if 'thumbnail_url' not in video_info:
369 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
371 else: # don't panic if we can't find it
372 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
376 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
378 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
379 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
380 for expression in format_expressions:
382 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
387 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
388 if video_description: video_description = clean_html(video_description)
389 else: video_description = ''
392 video_subtitles = None
393 if self._downloader.params.get('writesubtitles', False):
395 self.report_video_subtitles_download(video_id)
396 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
398 srt_list = compat_urllib_request.urlopen(request).read()
399 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
400 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
401 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
402 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
403 if not srt_lang_list:
404 raise Trouble(u'WARNING: video has no closed captions')
405 if self._downloader.params.get('subtitleslang', False):
406 srt_lang = self._downloader.params.get('subtitleslang')
407 elif 'en' in srt_lang_list:
410 srt_lang = srt_lang_list.keys()[0]
411 if not srt_lang in srt_lang_list:
412 raise Trouble(u'WARNING: no closed captions found in the specified language')
413 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
415 srt_xml = compat_urllib_request.urlopen(request).read()
416 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
417 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
419 raise Trouble(u'WARNING: unable to download video subtitles')
420 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
421 except Trouble as trouble:
422 self._downloader.trouble(trouble[0])
424 if 'length_seconds' not in video_info:
425 self._downloader.trouble(u'WARNING: unable to extract video duration')
428 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
431 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
433 # Decide which formats to download
434 req_format = self._downloader.params.get('format', None)
436 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
437 self.report_rtmp_download()
438 video_url_list = [(None, video_info['conn'][0])]
439 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
440 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
441 url_data = [parse_qs(uds) for uds in url_data_strs]
442 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
443 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
445 format_limit = self._downloader.params.get('format_limit', None)
446 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
447 if format_limit is not None and format_limit in available_formats:
448 format_list = available_formats[available_formats.index(format_limit):]
450 format_list = available_formats
451 existing_formats = [x for x in format_list if x in url_map]
452 if len(existing_formats) == 0:
453 self._downloader.trouble(u'ERROR: no known formats available for video')
455 if self._downloader.params.get('listformats', None):
456 self._print_formats(existing_formats)
458 if req_format is None or req_format == 'best':
459 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
460 elif req_format == 'worst':
461 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
462 elif req_format in ('-1', 'all'):
463 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
465 # Specific formats. We pick the first in a slash-delimeted sequence.
466 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
467 req_formats = req_format.split('/')
468 video_url_list = None
469 for rf in req_formats:
471 video_url_list = [(rf, url_map[rf])]
473 if video_url_list is None:
474 self._downloader.trouble(u'ERROR: requested format not available')
477 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
481 for format_param, video_real_url in video_url_list:
483 video_extension = self._video_extensions.get(format_param, 'flv')
485 video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
486 self._video_dimensions.get(format_param, '???'))
489 'id': video_id.decode('utf-8'),
490 'url': video_real_url.decode('utf-8'),
491 'uploader': video_uploader.decode('utf-8'),
492 'upload_date': upload_date,
493 'title': video_title,
494 'ext': video_extension.decode('utf-8'),
495 'format': video_format,
496 'thumbnail': video_thumbnail.decode('utf-8'),
497 'description': video_description,
498 'player_url': player_url,
499 'subtitles': video_subtitles,
500 'duration': video_duration
505 class MetacafeIE(InfoExtractor):
506 """Information Extractor for metacafe.com."""
508 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
509 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
510 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
511 IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring up a downloader."""
    InfoExtractor.__init__(self, downloader)

def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    status = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(status)

def report_age_confirmation(self):
    """Announce that the age gate is being confirmed."""
    status = u'[metacafe] Confirming age'
    self._downloader.to_screen(status)

def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    status = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)

def report_extraction(self, video_id):
    """Announce information extraction for the given video id."""
    status = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
532 def _real_initialize(self):
533 # Retrieve disclaimer
534 request = compat_urllib_request.Request(self._DISCLAIMER)
536 self.report_disclaimer()
537 disclaimer = compat_urllib_request.urlopen(request).read()
538 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
539 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
545 'submit': "Continue - I'm over 18",
547 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
549 self.report_age_confirmation()
550 disclaimer = compat_urllib_request.urlopen(request).read()
551 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
552 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
555 def _real_extract(self, url):
556 # Extract id and simplified title from URL
557 mobj = re.match(self._VALID_URL, url)
559 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
562 video_id = mobj.group(1)
564 # Check if video comes from YouTube
565 mobj2 = re.match(r'^yt-(.*)$', video_id)
566 if mobj2 is not None:
567 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
570 # Retrieve video webpage to extract further information
571 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
573 self.report_download_webpage(video_id)
574 webpage = compat_urllib_request.urlopen(request).read()
575 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
576 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
579 # Extract URL, uploader and title from webpage
580 self.report_extraction(video_id)
581 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
583 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
584 video_extension = mediaURL[-3:]
586 # Extract gdaKey if available
587 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
591 gdaKey = mobj.group(1)
592 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
594 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
596 self._downloader.trouble(u'ERROR: unable to extract media URL')
598 vardict = parse_qs(mobj.group(1))
599 if 'mediaData' not in vardict:
600 self._downloader.trouble(u'ERROR: unable to extract media URL')
602 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 mediaURL = mobj.group(1).replace('\\/', '/')
607 video_extension = mediaURL[-3:]
608 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
610 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
612 self._downloader.trouble(u'ERROR: unable to extract title')
614 video_title = mobj.group(1).decode('utf-8')
616 mobj = re.search(r'submitter=(.*?);', webpage)
618 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
620 video_uploader = mobj.group(1)
623 'id': video_id.decode('utf-8'),
624 'url': video_url.decode('utf-8'),
625 'uploader': video_uploader.decode('utf-8'),
627 'title': video_title,
628 'ext': video_extension.decode('utf-8'),
632 class DailymotionIE(InfoExtractor):
633 """Information Extractor for Dailymotion"""
635 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
636 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring up a downloader."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    status = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)

def report_extraction(self, video_id):
    """Announce information extraction for the given video id."""
    status = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
649 def _real_extract(self, url):
650 # Extract id and simplified title from URL
651 mobj = re.match(self._VALID_URL, url)
653 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
656 video_id = mobj.group(1).split('_')[0].split('?')[0]
658 video_extension = 'mp4'
660 # Retrieve video webpage to extract further information
661 request = compat_urllib_request.Request(url)
662 request.add_header('Cookie', 'family_filter=off')
664 self.report_download_webpage(video_id)
665 webpage = compat_urllib_request.urlopen(request).read()
666 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
667 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
670 # Extract URL, uploader and title from webpage
671 self.report_extraction(video_id)
672 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
674 self._downloader.trouble(u'ERROR: unable to extract media URL')
676 flashvars = compat_urllib_parse.unquote(mobj.group(1))
678 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
681 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
684 self._downloader.trouble(u'ERROR: unable to extract video URL')
687 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
689 self._downloader.trouble(u'ERROR: unable to extract video URL')
692 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
694 # TODO: support choosing qualities
696 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
698 self._downloader.trouble(u'ERROR: unable to extract title')
700 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
702 video_uploader = None
703 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
705 # lookin for official user
706 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
707 if mobj_official is None:
708 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
710 video_uploader = mobj_official.group(1)
712 video_uploader = mobj.group(1)
714 video_upload_date = None
715 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
717 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
720 'id': video_id.decode('utf-8'),
721 'url': video_url.decode('utf-8'),
722 'uploader': video_uploader.decode('utf-8'),
723 'upload_date': video_upload_date,
724 'title': video_title,
725 'ext': video_extension.decode('utf-8'),
729 class GoogleIE(InfoExtractor):
730 """Information extractor for video.google.com."""
732 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
733 IE_NAME = u'video.google'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring up a downloader."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    status = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)

def report_extraction(self, video_id):
    """Announce information extraction for the given video id."""
    status = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
746 def _real_extract(self, url):
747 # Extract id from URL
748 mobj = re.match(self._VALID_URL, url)
750 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
753 video_id = mobj.group(1)
755 video_extension = 'mp4'
757 # Retrieve video webpage to extract further information
758 request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
760 self.report_download_webpage(video_id)
761 webpage = compat_urllib_request.urlopen(request).read()
762 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
763 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
766 # Extract URL, uploader, and title from webpage
767 self.report_extraction(video_id)
768 mobj = re.search(r"download_url:'([^']+)'", webpage)
770 video_extension = 'flv'
771 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
773 self._downloader.trouble(u'ERROR: unable to extract media URL')
775 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
776 mediaURL = mediaURL.replace('\\x3d', '\x3d')
777 mediaURL = mediaURL.replace('\\x26', '\x26')
781 mobj = re.search(r'<title>(.*)</title>', webpage)
783 self._downloader.trouble(u'ERROR: unable to extract title')
785 video_title = mobj.group(1).decode('utf-8')
787 # Extract video description
788 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
790 self._downloader.trouble(u'ERROR: unable to extract video description')
792 video_description = mobj.group(1).decode('utf-8')
793 if not video_description:
794 video_description = 'No description available.'
796 # Extract video thumbnail
797 if self._downloader.params.get('forcethumbnail', False):
798 request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
800 webpage = compat_urllib_request.urlopen(request).read()
801 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
802 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
804 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
806 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
808 video_thumbnail = mobj.group(1)
809 else: # we need something to pass to process_info
813 'id': video_id.decode('utf-8'),
814 'url': video_url.decode('utf-8'),
817 'title': video_title,
818 'ext': video_extension.decode('utf-8'),
822 class PhotobucketIE(InfoExtractor):
823 """Information extractor for photobucket.com."""
825 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
826 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring up a downloader."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    status = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)

def report_extraction(self, video_id):
    """Announce information extraction for the given video id."""
    status = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
839 def _real_extract(self, url):
840 # Extract id from URL
841 mobj = re.match(self._VALID_URL, url)
843 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
846 video_id = mobj.group(1)
848 video_extension = 'flv'
850 # Retrieve video webpage to extract further information
851 request = compat_urllib_request.Request(url)
853 self.report_download_webpage(video_id)
854 webpage = compat_urllib_request.urlopen(request).read()
855 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
856 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
859 # Extract URL, uploader, and title from webpage
860 self.report_extraction(video_id)
861 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
863 self._downloader.trouble(u'ERROR: unable to extract media URL')
865 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
869 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
871 self._downloader.trouble(u'ERROR: unable to extract title')
873 video_title = mobj.group(1).decode('utf-8')
875 video_uploader = mobj.group(2).decode('utf-8')
878 'id': video_id.decode('utf-8'),
879 'url': video_url.decode('utf-8'),
880 'uploader': video_uploader,
882 'title': video_title,
883 'ext': video_extension.decode('utf-8'),
887 class YahooIE(InfoExtractor):
888 """Information extractor for video.yahoo.com."""
890 # _VALID_URL matches all Yahoo! Video URLs
891 # _VPAGE_URL matches only the extractable '/watch/' URLs
892 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
893 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
894 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring up a downloader."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    status = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)

def report_extraction(self, video_id):
    """Announce information extraction for the given video id."""
    status = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
907 def _real_extract(self, url, new_video=True):
908 # Extract ID from URL
909 mobj = re.match(self._VALID_URL, url)
911 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
914 video_id = mobj.group(2)
915 video_extension = 'flv'
917 # Rewrite valid but non-extractable URLs as
918 # extractable English language /watch/ URLs
919 if re.match(self._VPAGE_URL, url) is None:
920 request = compat_urllib_request.Request(url)
922 webpage = compat_urllib_request.urlopen(request).read()
923 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
924 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
927 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
929 self._downloader.trouble(u'ERROR: Unable to extract id field')
931 yahoo_id = mobj.group(1)
933 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
935 self._downloader.trouble(u'ERROR: Unable to extract vid field')
937 yahoo_vid = mobj.group(1)
939 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
940 return self._real_extract(url, new_video=False)
942 # Retrieve video webpage to extract further information
943 request = compat_urllib_request.Request(url)
945 self.report_download_webpage(video_id)
946 webpage = compat_urllib_request.urlopen(request).read()
947 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
951 # Extract uploader and title from webpage
952 self.report_extraction(video_id)
953 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
955 self._downloader.trouble(u'ERROR: unable to extract video title')
957 video_title = mobj.group(1).decode('utf-8')
959 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
961 self._downloader.trouble(u'ERROR: unable to extract video uploader')
963 video_uploader = mobj.group(1).decode('utf-8')
965 # Extract video thumbnail
966 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
968 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
970 video_thumbnail = mobj.group(1).decode('utf-8')
972 # Extract video description
973 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
975 self._downloader.trouble(u'ERROR: unable to extract video description')
977 video_description = mobj.group(1).decode('utf-8')
978 if not video_description:
979 video_description = 'No description available.'
981 # Extract video height and width
982 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
984 self._downloader.trouble(u'ERROR: unable to extract video height')
986 yv_video_height = mobj.group(1)
988 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
990 self._downloader.trouble(u'ERROR: unable to extract video width')
992 yv_video_width = mobj.group(1)
994 # Retrieve video playlist to extract media URL
995 # I'm not completely sure what all these options are, but we
996 # seem to need most of them, otherwise the server sends a 401.
997 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
998 yv_bitrate = '700' # according to Wikipedia this is hard-coded
999 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1000 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1001 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
1006 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract media URL from playlist XML
1010 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1012 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1014 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1015 video_url = unescapeHTML(video_url)
1018 'id': video_id.decode('utf-8'),
1020 'uploader': video_uploader,
1021 'upload_date': None,
1022 'title': video_title,
1023 'ext': video_extension.decode('utf-8'),
1024 'thumbnail': video_thumbnail.decode('utf-8'),
1025 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page.

        new_video is kept for interface compatibility with other extractors;
        it is not consulted here. Returns a one-element list of info dicts,
        or None after reporting trouble.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # split() missing its marker or malformed JSON
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body; reports trouble on failure."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map the listed groups into a dict.

        matchTuples is a list of (group_index, key, error_message); the
        error message is reported if that group did not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp stream for a live arte.tv page (side effects only)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the arte+7 indirection chain and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are located but not returned as info dicts here.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with the HEAD-aware handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction continue.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL carried a concrete video id in group 3.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the channel for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue the user's uploads for download, honoring playliststart/end."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Page size of the Ajax episode-list endpoint; the code below relies on it
    # to detect the last page. NOTE(review): value reconstructed — the original
    # declaration fell outside this view; confirm against upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue the user's videos for download, honoring playliststart/end."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string for the regex (was '\s+', which relies on an
                # undefined-in-spirit escape in a plain string literal)
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # ordered best-quality-first; used both for extraction and format selection
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # the login form being served back means authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # fixed typo in the warning message ("exceded" -> "exceeded")
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except Exception:
                    # best-effort date parsing; was a bare except, narrowed
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # initialized so an empty url_map fails cleanly below instead of
        # raising NameError on video_url_list
        video_url_list = []
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Fetches the JSON metadata for a blip.tv page; when the URL turns out
    to serve the media directly (video/* Content-Type), falls back to a
    direct download using the already-open handle.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        sep = '&' if '?' in url else '?'
        json_url = url + sep + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            url_handle = compat_urllib_request.urlopen(request)
            if url_handle.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': url_handle,
                }
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return

        if info is None: # Regular URL
            try:
                json_code = url_handle.read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                data = json_data['Post'] if 'Post' in json_data else json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                ext_match = re.match(self._URL_EXT, video_url)
                if ext_match is None:
                    raise ValueError('Can not determine filename extension')
                ext = ext_match.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some formats only to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # fixed attribute typo: was self._download.trouble, which would
            # raise AttributeError instead of reporting the bad URL
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the flv lives beside it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # bitrates, worst to best; the last entry is the default pick
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms like ":tds" redirect to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to discover the concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed from `format` to avoid shadowing the builtin)
            video_format, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the page's og: meta tags to find the Flash player, then pulls
    the (almost-)JSON config the player is pointed at.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Decode using the charset advertised in the HTTP header, if any.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(showName)
        try:
            config_text = compat_urllib_request.urlopen(config_url).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        video_url = playlist[1]['url']

        return [{
            'id': videoId,
            'url': video_url,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Resolves the page's internal video id, then reads title, file URL and
    thumbnail from the moogaloop metadata XML.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if id_match is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_video_id = id_match.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            info['ext'] = info['url'].rpartition('.')[2]
        except IndexError:
            # a missing node means the metadata XML has an unexpected shape
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in a flashvars parameter)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1).decode('utf-8'))

        # Extract title
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1).decode('utf-8')

        # Extract video thumbnail
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search(r'"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj:
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search(r'"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search(r'track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date
        upload_date = None
        mobj = re.search(r"pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            # modern `as` except syntax, consistent with the rest of the file
            # (was the Py2-only `except Exception, e:` form)
            except Exception as err:
                self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id': video_id.decode('utf-8'),
            'url': mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': title,
            'ext': u'mp3',
            'description': description.decode('utf-8')
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The RTMP path is stored base64-encoded in the page's `jsclassref`
    attribute; the id and extension come from the decoded filename.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL
        ref_match = re.search(r"jsclassref='([^']*)'", webpage)
        if ref_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(ref_match.group(1).decode('base64'))

        # Extract title
        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Resolves a cloudcast page through Mixcloud's JSON API and selects the
    first working download URL for the requested format/bitrate.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        'best' (or an unknown bitrate) selects the highest advertised
        bitrate; formats without per-bitrate dicts are returned whole.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # BUG FIX: close the probe response so the socket is not
                # leaked; only reachability matters here.
                compat_urllib_request.urlopen(url).close()
                return url
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                continue
        return None

    def _print_formats(self, formats):
        """Print a table of available formats/bitrates to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until a live URL is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            # IDIOM FIX: replaced the fragile `and/or` ternary emulation.
            'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific VideoPage (course + video), a
    CoursePage (expanded into its videos), and the root HomePage
    (expanded into all courses).
    """

    # BUG FIX: dots in the host name are now escaped; previously they
    # matched any character (e.g. "openclassroomXstanfordYedu").
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each video page link once, preserving page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page found on the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Song metadata lives in <meta> tags; the page is latin-1 encoded.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are split into segments; each segment URL is derived from a
    server-provided obfuscated file id and per-segment key.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Shuffle Youku's alphabet with their linear-congruential
        # generator, seeded by the server-provided 'seed'.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # 'fileId' is a '*'-separated list of indices into the mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # NOTE(review): the dot in "XNXX.COM" is unescaped and matches any
    # character; left as-is to avoid changing matching behaviour.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
            # CONSISTENCY FIX: format the error through compat_str() like
            # every other extractor in this file.
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # Post URL: optional https scheme, any intermediate path components,
    # then a numeric user id followed by /posts/<post id>.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Initialise through the shared InfoExtractor constructor."""
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Log which Google+ post entry is being downloaded."""
    entry = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % entry)
def report_date(self, upload_date):
    """Log the upload date found for the entry."""
    message = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(message)
def report_uploader(self, uploader):
    """Log the uploader found for the entry."""
    name = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % name)
def report_title(self, video_title):
    """Log the title found for the entry."""
    title = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title)
def report_extract_vid_page(self, video_page):
    """Log which video page is being scraped for stream links."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3476 def _real_extract(self, url):
3477 # Extract id from URL
3478 mobj = re.match(self._VALID_URL, url)
3480 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3483 post_url = mobj.group(0)
3484 video_id = mobj.group(2)
3486 video_extension = 'flv'
3488 # Step 1, Retrieve post webpage to extract further information
3489 self.report_extract_entry(post_url)
3490 request = compat_urllib_request.Request(post_url)
3492 webpage = compat_urllib_request.urlopen(request).read()
3493 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3494 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3497 # Extract update date
3499 pattern = 'title="Timestamp">(.*?)</a>'
3500 mobj = re.search(pattern, webpage)
3502 upload_date = mobj.group(1)
3503 # Convert timestring to a format suitable for filename
3504 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3505 upload_date = upload_date.strftime('%Y%m%d')
3506 self.report_date(upload_date)
3510 pattern = r'rel\="author".*?>(.*?)</a>'
3511 mobj = re.search(pattern, webpage)
3513 uploader = mobj.group(1)
3514 self.report_uploader(uploader)
3517 # Get the first line for title
3519 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3520 mobj = re.search(pattern, webpage)
3522 video_title = mobj.group(1)
3523 self.report_title(video_title)
3525 # Step 2, Stimulate clicking the image box to launch video
3526 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3527 mobj = re.search(pattern, webpage)
3529 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3531 video_page = mobj.group(1)
3532 request = compat_urllib_request.Request(video_page)
3534 webpage = compat_urllib_request.urlopen(request).read()
3535 except (compat_urllib_error.URLError, httplib.HTTPException, socket.error) as err:
3536 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3538 self.report_extract_vid_page(video_page)
3541 # Extract video links on video page
3542 """Extract video links of all sizes"""
3543 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3544 mobj = re.findall(pattern, webpage)
3546 self._downloader.trouble(u'ERROR: unable to extract video links')
3548 # Sort in resolution
3549 links = sorted(mobj)
3551 # Choose the lowest of the sort, i.e. highest resolution
3552 video_url = links[-1]
3553 # Only get the url. The resolution part in the tuple has no use anymore
3554 video_url = video_url[-1]
3555 # Treat escaped \u0026 style hex
3556 video_url = unicode(video_url, "unicode_escape")
3560 'id': video_id.decode('utf-8'),
3562 'uploader': uploader.decode('utf-8'),
3563 'upload_date': upload_date.decode('utf-8'),
3564 'title': video_title.decode('utf-8'),
3565 'ext': video_extension.decode('utf-8'),