2 # -*- coding: utf-8 -*-
11 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily initialized by initialize(); _WORKING=False marks a broken IE.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): table entries other than '38' were elided in the reviewed
    # copy and are reconstructed from the itag convention — confirm upstream.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # NOTE(review): dimensions table reconstructed — confirm upstream.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses extended syntax, so re.VERBOSE is required here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'  # default caption duration when dur= is absent
            start = float(start)  # regex groups are strings; coerce before math
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the known itag/extension/dimension for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language so the scraped pages come back in English.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':       '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video metadata and direct URLs; returns a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage = compat_urllib_request.urlopen(request).read()
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass  # try the next date format

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # FIX: dict.keys() is not subscriptable on Python 3.
                    srt_lang = list(srt_lang_list.keys())[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # FIX: exceptions are not indexable on Python 3; use .args.
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt- prefixed ids straight to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse flashvars for the media URL.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality flashvars URL, title, uploader and date."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip title suffix and query string from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the highest quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download; fall back to the flv stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     None,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# Extract the real video URL and metadata for a video.yahoo.com page.
# Recurses once (new_video=False) after rewriting a non-canonical URL to the
# English /watch/ form.
# NOTE(review): gaps in the embedded numbering indicate that guard lines
# (e.g. `if mobj is None:`, `try:`, `return` statements and the closing of the
# final returned dict/list) were elided from this listing — confirm against VCS.
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912 video_id = mobj.group(2)
913 video_extension = 'flv'
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = compat_urllib_request.Request(url)
920 webpage = compat_urllib_request.urlopen(request).read()
921 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
# Pull the canonical id/vid pair out of the page's JavaScript and rebuild the URL.
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
929 yahoo_id = mobj.group(1)
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
935 yahoo_vid = mobj.group(1)
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
# Recurse exactly once on the rewritten canonical URL.
938 return self._real_extract(url, new_video=False)
940 # Retrieve video webpage to extract further information
941 request = compat_urllib_request.Request(url)
943 self.report_download_webpage(video_id)
944 webpage = compat_urllib_request.urlopen(request).read()
945 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
953 self._downloader.trouble(u'ERROR: unable to extract video title')
955 video_title = mobj.group(1).decode('utf-8')
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): the regex above has two groups — group(1) is the
# 'people'/'profile' path token, group(2) is the uploader name. Using
# group(1) here looks wrong (would yield 'people' or 'profile'); confirm.
961 video_uploader = mobj.group(1).decode('utf-8')
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
968 video_thumbnail = mobj.group(1).decode('utf-8')
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
973 self._downloader.trouble(u'ERROR: unable to extract video description')
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
982 self._downloader.trouble(u'ERROR: unable to extract video height')
984 yv_video_height = mobj.group(1)
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
988 self._downloader.trouble(u'ERROR: unable to extract video width')
990 yv_video_width = mobj.group(1)
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1001 self.report_download_webpage(video_id)
1002 webpage = compat_urllib_request.urlopen(request).read()
1003 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
# APP + FULLPATH are concatenated, percent-decoded, then HTML-unescaped.
1012 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
# Result dictionary handed back to the FileDownloader (see the class docstring
# for the required fields). upload_date is unavailable on Yahoo pages.
1016 'id': video_id.decode('utf-8'),
1018 'uploader': video_uploader,
1019 'upload_date': None,
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, dict braces) were elided from this listing — confirm against VCS.
1027 class VimeoIE(InfoExtractor):
1028 """Information extractor for vimeo.com."""
1030 # _VALID_URL matches Vimeo URLs
1031 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
# Constructor: delegates to the InfoExtractor base.
1034 def __init__(self, downloader=None):
1035 InfoExtractor.__init__(self, downloader)
1037 def report_download_webpage(self, video_id):
1038 """Report webpage download."""
1039 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1041 def report_extraction(self, video_id):
1042 """Report information extraction."""
1043 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Extract metadata and a play_redirect URL for a Vimeo video page.
1045 def _real_extract(self, url, new_video=True):
1046 # Extract ID from URL
1047 mobj = re.match(self._VALID_URL, url)
1049 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 video_id = mobj.group(1)
1054 # Retrieve video webpage to extract further information
1055 request = compat_urllib_request.Request(url, None, std_headers)
1057 self.report_download_webpage(video_id)
1058 webpage = compat_urllib_request.urlopen(request).read()
1059 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1060 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063 # Now we begin extracting as much information as we can from what we
1064 # retrieved. First we extract the information common to all extractors,
1065 # and latter we extract those that are Vimeo specific.
1066 self.report_extraction(video_id)
1068 # Extract the config JSON
# NOTE: string-splitting the page on ' = {config:' / ',assets:' is fragile;
# it breaks whenever Vimeo reshuffles the embedded player bootstrap script.
1069 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1071 config = json.loads(config)
1073 self._downloader.trouble(u'ERROR: unable to extract info section')
1077 video_title = config["video"]["title"]
1080 video_uploader = config["video"]["owner"]["name"]
1082 # Extract video thumbnail
1083 video_thumbnail = config["video"]["thumbnail"]
1085 # Extract video description
1086 video_description = get_element_by_id("description", webpage.decode('utf8'))
1087 if video_description: video_description = clean_html(video_description)
1088 else: video_description = ''
1090 # Extract upload date
1091 video_upload_date = None
1092 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1093 if mobj is not None:
1094 video_upload_date = mobj.group(1)
1096 # Vimeo specific: extract request signature and timestamp
1097 sig = config['request']['signature']
1098 timestamp = config['request']['timestamp']
1100 # Vimeo specific: extract video codec and quality information
1101 # First consider quality, then codecs, then take everything
1102 # TODO bind to format param
1103 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1104 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec into hd/sd/other, preserving codec preference order.
1105 for codec_name, codec_extension in codecs:
1106 if codec_name in config["video"]["files"]:
1107 if 'hd' in config["video"]["files"][codec_name]:
1108 files['hd'].append((codec_name, codec_extension, 'hd'))
1109 elif 'sd' in config["video"]["files"][codec_name]:
1110 files['sd'].append((codec_name, codec_extension, 'sd'))
1112 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first (codec, extension, quality) tuple from the best non-empty bucket.
1114 for quality in ('hd', 'sd', 'other'):
1115 if len(files[quality]) > 0:
1116 video_quality = files[quality][0][2]
1117 video_codec = files[quality][0][0]
1118 video_extension = files[quality][0][1]
1119 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122 self._downloader.trouble(u'ERROR: no known codec found')
# Build the signed play_redirect URL used as the actual media URL.
1125 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1126 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Tail of the returned info dictionary (leading entries elided in this listing).
1131 'uploader': video_uploader,
1132 'upload_date': video_upload_date,
1133 'title': video_title,
1134 'ext': video_extension,
1135 'thumbnail': video_thumbnail,
1136 'description': video_description,
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, dict/tuple delimiters) were elided from this listing — confirm against VCS.
1140 class ArteTvIE(InfoExtractor):
1141 """arte.tv information extractor."""
# Matches both French and German catalogue pages on videos.arte.tv.
1143 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages end in index-<n>.html; matched against the URL tail.
1144 _LIVE_URL = r'index-[0-9]+\.html$'
1146 IE_NAME = u'arte.tv'
1148 def __init__(self, downloader=None):
1149 InfoExtractor.__init__(self, downloader)
1151 def report_download_webpage(self, video_id):
1152 """Report webpage download."""
1153 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1155 def report_extraction(self, video_id):
1156 """Report information extraction."""
1157 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Fetch a URL and return its body; also bumps the downloader's download counter.
# NOTE(review): the return statement is elided in this listing — presumably it
# returns `webpage`; verify.
1159 def fetch_webpage(self, url):
1160 self._downloader.increment_downloads()
1161 request = compat_urllib_request.Request(url)
1163 self.report_download_webpage(url)
1164 webpage = compat_urllib_request.urlopen(request).read()
1165 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1166 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1168 except ValueError as err:
1169 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, run `regex` with `regexFlags`, and collect the listed match
# groups into an info dict. matchTuples is a list of (group_index, key, error_msg).
1173 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1174 page = self.fetch_webpage(url)
1175 mobj = re.search(regex, page, regexFlags)
1179 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1182 for (i, key, err) in matchTuples:
1183 if mobj.group(i) is None:
1184 self._downloader.trouble(err)
1187 info[key] = mobj.group(i)
# Resolve a live-stream page: locate the videothek JS, then scrape the
# geo-restricted FR/DE stream path and SWF player from it.
1191 def extractLiveStream(self, url):
# Language code is a fixed path component of the live URL.
1192 video_lang = url.split('/')[-4]
1193 info = self.grep_webpage(
1195 r'src="(.*?/videothek_js.*?\.js)',
1198 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1201 http_host = url.split('/')[2]
1202 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1203 info = self.grep_webpage(
1205 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1206 '(http://.*?\.swf).*?' +
1210 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1211 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1212 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is built but no return/use is visible in this listing.
1215 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Resolve an Arte+7 (catch-up) page through its chain of XML references down
# to the HD media URL, returning the info dictionary.
1217 def extractPlus7Stream(self, url):
1218 video_lang = url.split('/')[-3]
1219 info = self.grep_webpage(
1221 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1224 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1227 next_url = compat_urllib_parse.unquote(info.get('url'))
1228 info = self.grep_webpage(
# Pick the <video> reference matching the page language.
1230 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1233 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1236 next_url = compat_urllib_parse.unquote(info.get('url'))
1238 info = self.grep_webpage(
1240 r'<video id="(.*?)".*?>.*?' +
1241 '<name>(.*?)</name>.*?' +
1242 '<dateVideo>(.*?)</dateVideo>.*?' +
1243 '<url quality="hd">(.*?)</url>',
1246 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1247 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1248 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1249 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Info dictionary fields assembled from the final XML document.
1254 'id': info.get('id'),
1255 'url': compat_urllib_parse.unquote(info.get('url')),
1256 'uploader': u'arte.tv',
1257 'upload_date': info.get('date'),
1258 'title': info.get('title'),
# Dispatch: live pages vs Arte+7 catch-up pages.
1264 def _real_extract(self, url):
1265 video_id = url.split('/')[-1]
1266 self.report_extraction(video_id)
1268 if re.search(self._LIVE_URL, video_id) is not None:
1269 self.extractLiveStream(url)
1272 info = self.extractPlus7Stream(url)
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, class docstring quotes) were elided from this listing — confirm against VCS.
1277 class GenericIE(InfoExtractor):
1278 """Generic last-resort information extractor."""
1281 IE_NAME = u'generic'
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1286 def report_download_webpage(self, video_id):
1287 """Report webpage download."""
# The generic IE warns loudly: it is only ever a fallback.
1288 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1289 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1291 def report_extraction(self, video_id):
1292 """Report information extraction."""
1293 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1295 def report_following_redirect(self, new_url):
1296 """Report information extraction."""
1297 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Detect URL-shortener style redirects by issuing a HEAD request; if the final
# URL differs, restart the whole download chain on it.
1299 def _test_redirect(self, url):
1300 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass forcing the HEAD method.
1301 class HeadRequest(compat_urllib_request.Request):
1302 def get_method(self):
1305 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1307 Subclass the HTTPRedirectHandler to make it use our
1308 HeadRequest also on the redirected URL
1310 def redirect_request(self, req, fp, code, msg, headers, newurl):
1311 if code in (301, 302, 303, 307):
1312 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD request carries no body.
1313 newheaders = dict((k,v) for k,v in req.headers.items()
1314 if k.lower() not in ("content-length", "content-type"))
1315 return HeadRequest(newurl,
1317 origin_req_host=req.get_origin_req_host(),
1320 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1322 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1324 Fallback to GET if HEAD is not allowed (405 HTTP error)
1326 def http_error_405(self, req, fp, code, msg, headers):
1330 newheaders = dict((k,v) for k,v in req.headers.items()
1331 if k.lower() not in ("content-length", "content-type"))
1332 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1334 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1338 opener = compat_urllib_request.OpenerDirector()
1339 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1340 HTTPMethodFallback, HEADRedirectHandler,
1341 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1342 opener.add_handler(handler())
1344 response = opener.open(HeadRequest(url))
1345 new_url = response.geturl()
1350 self.report_following_redirect(new_url)
# Hand the resolved URL back to the downloader to restart extractor selection.
1351 self._downloader.download([new_url])
# Last-resort extraction: scrape the page for a direct media URL in common
# flash-player embed patterns.
1354 def _real_extract(self, url):
1355 if self._test_redirect(url): return
1357 video_id = url.split('/')[-1]
1358 request = compat_urllib_request.Request(url)
1360 self.report_download_webpage(video_id)
1361 webpage = compat_urllib_request.urlopen(request).read()
1362 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1365 except ValueError as err:
1366 # since this is the last-resort InfoExtractor, if
1367 # this error is thrown, it'll be thrown here
1368 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371 self.report_extraction(video_id)
1372 # Start with something easy: JW Player in SWFObject
1373 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1375 # Broaden the search a little bit
1376 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1378 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381 # It's possible that one of the regexes
1382 # matched, but returned an empty group:
1383 if mobj.group(1) is None:
1384 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1387 video_url = compat_urllib_parse.unquote(mobj.group(1))
1388 video_id = os.path.basename(video_url)
1390 # here's a fun little line of code for you:
1391 video_extension = os.path.splitext(video_id)[1][1:]
1392 video_id = os.path.splitext(video_id)[0]
1394 # it's tempting to parse this further, but you would
1395 # have to take into account all the variations like
1396 # Video Title - Site Name
1397 # Site Name | Video Title
1398 # Video Title - Tagline | Site Name
1399 # and so on and so forth; it's just not practical
1400 mobj = re.search(r'<title>(.*)</title>', webpage)
1402 self._downloader.trouble(u'ERROR: unable to extract title')
1404 video_title = mobj.group(1).decode('utf-8')
1406 # video uploader is domain name
1407 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says "title" but this step extracts the
# uploader (domain name) — the message looks copy-pasted; consider fixing.
1409 self._downloader.trouble(u'ERROR: unable to extract title')
1411 video_uploader = mobj.group(1).decode('utf-8')
# Info dictionary (surrounding return/braces elided in this listing).
1414 'id': video_id.decode('utf-8'),
1415 'url': video_url.decode('utf-8'),
1416 'uploader': video_uploader,
1417 'upload_date': None,
1418 'title': video_title,
1419 'ext': video_extension.decode('utf-8'),
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns) were elided from this listing — confirm against VCS.
1423 class YoutubeSearchIE(InfoExtractor):
1424 """Information Extractor for YouTube search queries."""
# Accepts ytsearch:Q, ytsearchN:Q and ytsearchall:Q pseudo-URLs.
1425 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 JSON-C endpoint; paged 50 results at a time via start-index.
1426 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1427 _max_youtube_results = 1000
1428 IE_NAME = u'youtube:search'
1430 def __init__(self, downloader=None):
1431 InfoExtractor.__init__(self, downloader)
1433 def report_download_page(self, query, pagenum):
1434 """Report attempt to download search page with given number."""
1435 query = query.decode(preferredencoding())
1436 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the ytsearch prefix and delegate to _download_n_results with the
# requested result count (1 by default, _max_youtube_results for 'all').
1438 def _real_extract(self, query):
1439 mobj = re.match(self._VALID_URL, query)
1441 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1444 prefix, query = query.split(':')
1446 query = query.encode('utf-8')
1448 self._download_n_results(query, 1)
1450 elif prefix == 'all':
1451 self._download_n_results(query, self._max_youtube_results)
1457 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1459 elif n > self._max_youtube_results:
1460 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1461 n = self._max_youtube_results
1462 self._download_n_results(query, n)
1464 except ValueError: # parsing prefix as integer fails
1465 self._download_n_results(query, 1)
1468 def _download_n_results(self, query, n):
1469 """Downloads a specified number of results for a query"""
# Page through the API until `limit` (min of n and totalItems) is covered.
1475 while (50 * pagenum) < limit:
1476 self.report_download_page(query, pagenum+1)
1477 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1478 request = compat_urllib_request.Request(result_url)
1480 data = compat_urllib_request.urlopen(request).read()
1481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1482 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1484 api_response = json.loads(data)['data']
1486 new_ids = list(video['id'] for video in api_response['items'])
1487 video_ids += new_ids
1489 limit = min(n, api_response['totalItems'])
# Truncate to exactly n ids, then queue each watch URL on the downloader.
1492 if len(video_ids) > n:
1493 video_ids = video_ids[:n]
1494 for id in video_ids:
1495 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, loop headers) were elided from this listing — confirm against VCS.
1499 class GoogleSearchIE(InfoExtractor):
1500 """Information Extractor for Google Video search queries."""
# Accepts gvsearch:Q, gvsearchN:Q and gvsearchall:Q pseudo-URLs.
1501 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1502 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1503 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link signals more results.
1504 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1505 _max_google_results = 1000
1506 IE_NAME = u'video.google:search'
1508 def __init__(self, downloader=None):
1509 InfoExtractor.__init__(self, downloader)
1511 def report_download_page(self, query, pagenum):
1512 """Report attempt to download playlist page with given number."""
1513 query = query.decode(preferredencoding())
1514 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the gvsearch prefix and delegate to _download_n_results (same contract
# as the other search IEs in this file).
1516 def _real_extract(self, query):
1517 mobj = re.match(self._VALID_URL, query)
1519 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1522 prefix, query = query.split(':')
1524 query = query.encode('utf-8')
1526 self._download_n_results(query, 1)
1528 elif prefix == 'all':
1529 self._download_n_results(query, self._max_google_results)
1535 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537 elif n > self._max_google_results:
1538 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1539 n = self._max_google_results
1540 self._download_n_results(query, n)
1542 except ValueError: # parsing prefix as integer fails
1543 self._download_n_results(query, 1)
1546 def _download_n_results(self, query, n):
1547 """Downloads a specified number of results for a query"""
# Scrape HTML result pages 10 at a time until n ids are collected or the
# "next page" indicator disappears.
1553 self.report_download_page(query, pagenum)
1554 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1555 request = compat_urllib_request.Request(result_url)
1557 page = compat_urllib_request.urlopen(request).read()
1558 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1559 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1562 # Extract video identifiers
1563 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1564 video_id = mobj.group(1)
1565 if video_id not in video_ids:
1566 video_ids.append(video_id)
1567 if len(video_ids) == n:
1568 # Specified n videos reached
1569 for id in video_ids:
1570 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: flush whatever ids were collected.
1573 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1574 for id in video_ids:
1575 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1578 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, loop headers) were elided from this listing — confirm against VCS.
1581 class YahooSearchIE(InfoExtractor):
1582 """Information Extractor for Yahoo! Video search queries."""
# Accepts yvsearch:Q, yvsearchN:Q and yvsearchall:Q pseudo-URLs.
1583 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1584 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1585 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1586 _MORE_PAGES_INDICATOR = r'\s*Next'
1587 _max_yahoo_results = 1000
1588 IE_NAME = u'video.yahoo:search'
1590 def __init__(self, downloader=None):
1591 InfoExtractor.__init__(self, downloader)
1593 def report_download_page(self, query, pagenum):
1594 """Report attempt to download playlist page with given number."""
1595 query = query.decode(preferredencoding())
1596 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the yvsearch prefix and delegate to _download_n_results (same contract
# as the other search IEs in this file).
1598 def _real_extract(self, query):
1599 mobj = re.match(self._VALID_URL, query)
1601 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1604 prefix, query = query.split(':')
1606 query = query.encode('utf-8')
1608 self._download_n_results(query, 1)
1610 elif prefix == 'all':
1611 self._download_n_results(query, self._max_yahoo_results)
1617 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1619 elif n > self._max_yahoo_results:
1620 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1621 n = self._max_yahoo_results
1622 self._download_n_results(query, n)
1624 except ValueError: # parsing prefix as integer fails
1625 self._download_n_results(query, 1)
1628 def _download_n_results(self, query, n):
1629 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate ids across result pages.
1632 already_seen = set()
1636 self.report_download_page(query, pagenum)
1637 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1638 request = compat_urllib_request.Request(result_url)
1640 page = compat_urllib_request.urlopen(request).read()
1641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1642 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1645 # Extract video identifiers
1646 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647 video_id = mobj.group(1)
1648 if video_id not in already_seen:
1649 video_ids.append(video_id)
1650 already_seen.add(video_id)
1651 if len(video_ids) == n:
1652 # Specified n videos reached
1653 for id in video_ids:
1654 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: flush whatever ids were collected.
1657 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658 for id in video_ids:
1659 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate lines (guards, loop
# headers, returns) were elided from this listing — confirm against VCS.
1665 class YoutubePlaylistIE(InfoExtractor):
1666 """Information Extractor for YouTube playlists."""
# group(1) = prefix type (p/a/list), group(2) = playlist id,
# group(3) = optional embedded video id (downloaded directly if present).
1668 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1669 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1670 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1671 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1672 IE_NAME = u'youtube:playlist'
1674 def __init__(self, downloader=None):
1675 InfoExtractor.__init__(self, downloader)
1677 def report_download_page(self, playlist_id, pagenum):
1678 """Report attempt to download playlist page with given number."""
1679 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1681 def _real_extract(self, url):
1682 # Extract playlist id
1683 mobj = re.match(self._VALID_URL, url)
1685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video inside a playlist URL: download just that video.
1689 if mobj.group(3) is not None:
1690 self._downloader.download([mobj.group(3)])
1693 # Download playlist pages
1694 # prefix is 'p' as default for playlists but there are other types that need extra care
1695 playlist_prefix = mobj.group(1)
1696 if playlist_prefix == 'a':
1697 playlist_access = 'artist'
1699 playlist_prefix = 'p'
1700 playlist_access = 'view_play_list'
1701 playlist_id = mobj.group(2)
# Page through the playlist HTML until the pager's "next" control disappears.
1706 self.report_download_page(playlist_id, pagenum)
1707 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1708 request = compat_urllib_request.Request(url)
1710 page = compat_urllib_request.urlopen(request).read()
1711 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1712 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1715 # Extract video identifiers
1717 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1718 if mobj.group(1) not in ids_in_page:
1719 ids_in_page.append(mobj.group(1))
1720 video_ids.extend(ids_in_page)
1722 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1724 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1726 playliststart = self._downloader.params.get('playliststart', 1) - 1
1727 playlistend = self._downloader.params.get('playlistend', -1)
1728 if playlistend == -1:
1729 video_ids = video_ids[playliststart:]
1731 video_ids = video_ids[playliststart:playlistend]
1733 for id in video_ids:
1734 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (guards, loop
# headers, returns) were elided from this listing — confirm against VCS.
1738 class YoutubeChannelIE(InfoExtractor):
1739 """Information Extractor for YouTube channels."""
1741 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# List view sorted by date-added, paged via &page=N.
1742 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1743 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1744 IE_NAME = u'youtube:channel'
1746 def report_download_page(self, channel_id, pagenum):
1747 """Report attempt to download channel page with given number."""
1748 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1750 def _real_extract(self, url):
1751 # Extract channel id
1752 mobj = re.match(self._VALID_URL, url)
1754 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1757 # Download channel pages
1758 channel_id = mobj.group(1)
# Page through the channel's list view until the "Next" button disappears.
1763 self.report_download_page(channel_id, pagenum)
1764 url = self._TEMPLATE_URL % (channel_id, pagenum)
1765 request = compat_urllib_request.Request(url)
1767 page = compat_urllib_request.urlopen(request).read()
1768 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1769 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1772 # Extract video identifiers
1774 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1775 if mobj.group(1) not in ids_in_page:
1776 ids_in_page.append(mobj.group(1))
1777 video_ids.extend(ids_in_page)
1779 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1781 pagenum = pagenum + 1
# Queue every collected video on the downloader.
1783 for id in video_ids:
1784 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (loop headers,
# guards, breaks) were elided from this listing — confirm against VCS.
1788 class YoutubeUserIE(InfoExtractor):
1789 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs and the ytuser:NAME shorthand.
1791 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1792 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; pagination is driven by start-index.
1793 _GDATA_PAGE_SIZE = 50
1794 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1795 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1796 IE_NAME = u'youtube:user'
1798 def __init__(self, downloader=None):
1799 InfoExtractor.__init__(self, downloader)
1801 def report_download_page(self, username, start_index):
1802 """Report attempt to download user page."""
1803 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1804 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1806 def _real_extract(self, url):
1808 mobj = re.match(self._VALID_URL, url)
1810 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1813 username = mobj.group(1)
1815 # Download video ids using YouTube Data API. Result size per
1816 # query is limited (currently to 50 videos) so we need to query
1817 # page by page until there are no video ids - it means we got
# start-index is 1-based in the GData API.
1824 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1825 self.report_download_page(username, start_index)
1827 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1830 page = compat_urllib_request.urlopen(request).read()
1831 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1832 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1835 # Extract video identifiers
1838 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1839 if mobj.group(1) not in ids_in_page:
1840 ids_in_page.append(mobj.group(1))
1842 video_ids.extend(ids_in_page)
1844 # A little optimization - if current page is not
1845 # "full", ie. does not contain PAGE_SIZE video ids then
1846 # we can assume that this page is the last one - there
1847 # are no more ids on further pages - no need to query
1850 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1855 all_ids_count = len(video_ids)
1856 playliststart = self._downloader.params.get('playliststart', 1) - 1
1857 playlistend = self._downloader.params.get('playlistend', -1)
1859 if playlistend == -1:
1860 video_ids = video_ids[playliststart:]
1862 video_ids = video_ids[playliststart:playlistend]
1864 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1865 (username, all_ids_count, len(video_ids)))
1867 for video_id in video_ids:
1868 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): gaps in the embedded numbering indicate lines (loop headers,
# guards, breaks) were elided from this listing — confirm against VCS.
1871 class BlipTVUserIE(InfoExtractor):
1872 """Information Extractor for blip.tv users."""
# Accepts blip.tv/USERNAME URLs and the bliptvuser:USERNAME shorthand.
1874 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1876 IE_NAME = u'blip.tv:user'
1878 def __init__(self, downloader=None):
1879 InfoExtractor.__init__(self, downloader)
1881 def report_download_page(self, username, pagenum):
1882 """Report attempt to download user page."""
1883 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1884 (self.IE_NAME, username, pagenum))
1886 def _real_extract(self, url):
1888 mobj = re.match(self._VALID_URL, url)
1890 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1893 username = mobj.group(1)
# Mobile AJAX endpoint keyed by the numeric users_id scraped from the profile page.
1895 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1897 request = compat_urllib_request.Request(url)
1900 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1901 mobj = re.search(r'data-users-id="([^"]+)"', page)
1902 page_base = page_base % mobj.group(1)
1903 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1904 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1908 # Download video ids using BlipTV Ajax calls. Result size per
1909 # query is limited (currently to 12 videos) so we need to query
1910 # page by page until there are no video ids - it means we got
1917 self.report_download_page(username, pagenum)
1919 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1922 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1924 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1927 # Extract video identifiers
1930 for mobj in re.finditer(r'href="/([^"]+)"', page):
1931 if mobj.group(1) not in ids_in_page:
1932 ids_in_page.append(unescapeHTML(mobj.group(1)))
1934 video_ids.extend(ids_in_page)
1936 # A little optimization - if current page is not
1937 # "full", ie. does not contain PAGE_SIZE video ids then
1938 # we can assume that this page is the last one - there
1939 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced here but its definition is not
# visible in this listing (possibly elided) — verify it exists on the class.
1942 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1947 all_ids_count = len(video_ids)
1948 playliststart = self._downloader.params.get('playliststart', 1) - 1
1949 playlistend = self._downloader.params.get('playlistend', -1)
1951 if playlistend == -1:
1952 video_ids = video_ids[playliststart:]
1954 video_ids = video_ids[playliststart:playlistend]
1956 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1957 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1959 for video_id in video_ids:
1960 self._downloader.download([u'http://blip.tv/'+video_id])
1963 class DepositFilesIE(InfoExtractor):
1964     """Information extractor for depositfiles.com"""
# The (?#locale) comment documents the optional two-letter locale path
# segment ("../" matches it); group(1) is the file id.
1966     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1967     IE_NAME = u'DepositFiles'
1969     def __init__(self, downloader=None):
1970         InfoExtractor.__init__(self, downloader)
1972     def report_download_webpage(self, file_id):
1973         """Report webpage download."""
1974         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1976     def report_extraction(self, file_id):
1977         """Report information extraction."""
1978         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1980     def _real_extract(self, url):
1981         file_id = url.split('/')[-1]
1982         # Rebuild url in english locale
1983         url = 'http://depositfiles.com/en/files/' + file_id
1985         # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking "Free download".
1986         free_download_indication = { 'gateway_result' : '1' }
1987         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1989             self.report_download_webpage(file_id)
1990             webpage = compat_urllib_request.urlopen(request).read()
1991         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1992             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1995         # Search for the real file URL
1996         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1997         if (mobj is None) or (mobj.group(1) is None):
1998             # Try to figure out reason of the error.
# The site explains download restrictions in an <strong>Attention...
# banner; surface that message instead of a generic error when present.
1999             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2000             if (mobj is not None) and (mobj.group(1) is not None):
2001                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2002                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2004             self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2007         file_url = mobj.group(1)
2008         file_extension = os.path.splitext(file_url)[1][1:]
2010         # Search for file title
2011         mobj = re.search(r'<b title="(.*?)">', webpage)
2013             self._downloader.trouble(u'ERROR: unable to extract title')
2015         file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the .decode('utf-8') calls assume Python 2 byte strings.
2018             'id': file_id.decode('utf-8'),
2019             'url': file_url.decode('utf-8'),
2021             'upload_date': None,
2022             'title': file_title,
2023             'ext': file_extension.decode('utf-8'),
2027 class FacebookIE(InfoExtractor):
2028     """Information Extractor for Facebook"""
# Matches video/photo permalinks with a v=<digits> query parameter; the
# numeric id is the named group ID.
2031     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2032     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2033     _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; _real_extract picks the first
# available one as "best quality".
2034     _available_formats = ['video', 'highqual', 'lowqual']
2035     _video_extensions = {
2040     IE_NAME = u'facebook'
2042     def __init__(self, downloader=None):
2043         InfoExtractor.__init__(self, downloader)
2045     def _reporter(self, message):
2046         """Add header and report message."""
2047         self._downloader.to_screen(u'[facebook] %s' % message)
2049     def report_login(self):
2050         """Report attempt to log in."""
2051         self._reporter(u'Logging in')
2053     def report_video_webpage_download(self, video_id):
2054         """Report attempt to download video webpage."""
2055         self._reporter(u'%s: Downloading video webpage' % video_id)
2057     def report_information_extraction(self, video_id):
2058         """Report attempt to extract video information."""
2059         self._reporter(u'%s: Extracting video information' % video_id)
2061     def _parse_page(self, video_webpage):
2062         """Extract video information from page"""
# Each field is scraped from inline JavaScript blobs on the watch page;
# missing fields are simply omitted from the returned dict.
2064         data = {'title': r'\("video_title", "(.*?)"\)',
2065             'description': r'<div class="datawrap">(.*?)</div>',
2066             'owner': r'\("video_owner_name", "(.*?)"\)',
2067             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2070         for piece in data.keys():
2071             mobj = re.search(data[piece], video_webpage)
2072             if mobj is not None:
# NOTE(review): .decode("unicode_escape") assumes Python 2 byte strings.
2073                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one candidate URL per known format name.
2077         for fmt in self._available_formats:
2078             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2079             if mobj is not None:
2080                 # URL is in a Javascript segment inside an escaped Unicode format within
2081                 # the generally utf-8 page
2082                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2083         video_info['video_urls'] = video_urls
2087     def _real_initialize(self):
# Log in before extraction; --username/--password take precedence over
# .netrc credentials for the "facebook" machine entry.
2088         if self._downloader is None:
2093         downloader_params = self._downloader.params
2095         # Attempt to use provided username and password or .netrc data
2096         if downloader_params.get('username', None) is not None:
2097             useremail = downloader_params['username']
2098             password = downloader_params['password']
2099         elif downloader_params.get('usenetrc', False):
2101                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2102                 if info is not None:
2106                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2107             except (IOError, netrc.NetrcParseError) as err:
2108                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2111         if useremail is None:
# POST the mobile login form; a failed login only warns — extraction of
# public videos is still attempted.
2120         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2123             login_results = compat_urllib_request.urlopen(request).read()
2124             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2125                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2127         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2128             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2131     def _real_extract(self, url):
2132         mobj = re.match(self._VALID_URL, url)
2134             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2136         video_id = mobj.group('ID')
2139         self.report_video_webpage_download(video_id)
2140         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2142             page = compat_urllib_request.urlopen(request)
2143             video_webpage = page.read()
2144         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2145             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2148         # Start extracting information
2149         self.report_information_extraction(video_id)
2151         # Extract information
2152         video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; thumbnail only warns when absent.
2155         if 'owner' not in video_info:
2156             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2158         video_uploader = video_info['owner']
2161         if 'title' not in video_info:
2162             self._downloader.trouble(u'ERROR: unable to extract video title')
2164         video_title = video_info['title']
2165         video_title = video_title.decode('utf-8')
2168         if 'thumbnail' not in video_info:
2169             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2170             video_thumbnail = ''
2172             video_thumbnail = video_info['thumbnail']
# Normalize the RFC-2822 style date on the page to YYYYMMDD.
2176         if 'upload_date' in video_info:
2177             upload_time = video_info['upload_date']
2178             timetuple = email.utils.parsedate_tz(upload_time)
2179             if timetuple is not None:
2181                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2186         video_description = video_info.get('description', 'No description available.')
2188         url_map = video_info['video_urls']
2189         if len(url_map.keys()) > 0:
2190             # Decide which formats to download
2191             req_format = self._downloader.params.get('format', None)
2192             format_limit = self._downloader.params.get('format_limit', None)
# --max-quality caps the list at the limit format and everything worse.
2194             if format_limit is not None and format_limit in self._available_formats:
2195                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2197                 format_list = self._available_formats
2198             existing_formats = [x for x in format_list if x in url_map]
2199             if len(existing_formats) == 0:
2200                 self._downloader.trouble(u'ERROR: no known formats available for video')
2202             if req_format is None:
2203                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2204             elif req_format == 'worst':
2205                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2206             elif req_format == '-1':
2207                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2210                 if req_format not in url_map:
2211                     self._downloader.trouble(u'ERROR: requested format not available')
2213                 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
2216         for format_param, video_real_url in video_url_list:
2218             video_extension = self._video_extensions.get(format_param, 'mp4')
2221                 'id': video_id.decode('utf-8'),
2222                 'url': video_real_url.decode('utf-8'),
2223                 'uploader': video_uploader.decode('utf-8'),
2224                 'upload_date': upload_date,
2225                 'title': video_title,
2226                 'ext': video_extension.decode('utf-8'),
2227                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2228                 'thumbnail': video_thumbnail.decode('utf-8'),
2229                 'description': video_description.decode('utf-8'),
2233 class BlipTVIE(InfoExtractor):
2234     """Information extractor for blip.tv"""
2236     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off a media URL.
2237     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2238     IE_NAME = u'blip.tv'
2240     def report_extraction(self, file_id):
2241         """Report information extraction."""
2242         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2244     def report_direct_download(self, title):
2245         """Report information extraction."""
2246         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2248     def _real_extract(self, url):
2249         mobj = re.match(self._VALID_URL, url)
2251             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for the JSON description of the page (skin=json).
2258         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2259         request = compat_urllib_request.Request(json_url.encode('utf-8'))
2260         self.report_extraction(mobj.group(1))
2263             urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself rather than JSON, build a
# minimal info dict straight from the URL's basename.
2264             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2265                 basename = url.split('/')[-1]
2266                 title,ext = os.path.splitext(basename)
2267                 title = title.decode('UTF-8')
2268                 ext = ext.replace('.', '')
2269                 self.report_direct_download(title)
2274                     'upload_date': None,
2279         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2280             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2282         if info is None: # Regular URL
2284                 json_code = urlh.read()
2285             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2286                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
# Parse the JSON payload; blip.tv wraps the record in a 'Post' key for
# single videos.
2290                 json_data = json.loads(json_code)
2291                 if 'Post' in json_data:
2292                     data = json_data['Post']
# datestamp looks like "05-31-12 10:09AM"; normalize to YYYYMMDD.
2296                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2297                 video_url = data['media']['url']
2298                 umobj = re.match(self._URL_EXT, video_url)
2300                     raise ValueError('Can not determine filename extension')
2301                 ext = umobj.group(1)
2304                     'id': data['item_id'],
2306                     'uploader': data['display_name'],
2307                     'upload_date': upload_date,
2308                     'title': data['title'],
2310                     'format': data['media']['mimeType'],
2311                     'thumbnail': data['thumbnailUrl'],
2312                     'description': data['description'],
2313                     'player_url': data['embedUrl']
2315             except (ValueError,KeyError) as err:
2316                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): mutates the module-level std_headers dict, so the iTunes
# user agent leaks into every later request — confirm this is intended.
2319         std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # group(1) is the numeric video id, group(2) the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL and title from a myvideo.de watch page.

        Returns a one-element list of info dictionaries on success, or
        returns None after reporting trouble, like the sibling extractors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble(...) — there is no
            # _download attribute (set_downloader assigns _downloader),
            # so every invalid URL raised AttributeError instead of
            # reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The image_src thumbnail link doubles as the media base URL; the
        # flv lives beside the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2381 class ComedyCentralIE(InfoExtractor):
2382     """Information extractor for The Daily Show and Colbert Report """
# Accepts either a bare shortname (":tds", ":colbert", ...) or a full
# full-episodes URL; named groups: shortname, showname, episode.
2384     _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2385     IE_NAME = u'comedycentral'
# Known rendition bitrates, worst-first (turls[-1] below is "best").
2387     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2389     _video_extensions = {
2397     _video_dimensions = {
2406     def report_extraction(self, episode_id):
2407         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2409     def report_config_download(self, episode_id):
2410         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2412     def report_index_download(self, episode_id):
2413         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2415     def report_player_url(self, episode_id):
2416         self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2419     def _print_formats(self, formats):
# Print "bitrate : ext [dimensions]" lines for --list-formats.
2420         print('Available formats:')
2422             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2425     def _real_extract(self, url):
2426         mobj = re.match(self._VALID_URL, url)
2428             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Bare shortnames are rewritten to the show's full-episodes front page
# and re-matched so the rest of the method sees a normal URL.
2431         if mobj.group('shortname'):
2432             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2433                 url = u'http://www.thedailyshow.com/full-episodes/'
2435                 url = u'http://www.colbertnation.com/full-episodes/'
2436             mobj = re.match(self._VALID_URL, url)
2437             assert mobj is not None
2439         dlNewest = not mobj.group('episode')
2441             epTitle = mobj.group('showname')
2443             epTitle = mobj.group('episode')
2445         req = compat_urllib_request.Request(url)
2446         self.report_extraction(epTitle)
2448             htmlHandle = compat_urllib_request.urlopen(req)
2449             html = htmlHandle.read()
2450         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The front page redirects to the newest episode; re-match the final
# URL to recover a concrete episode title.
2454             url = htmlHandle.geturl()
2455             mobj = re.match(self._VALID_URL, url)
2457                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2459             if mobj.group('episode') == '':
2460                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2462             epTitle = mobj.group('episode')
# Locate the mtvnservices player URL embedded in the page.
2464         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2466         if len(mMovieParams) == 0:
2467             # The Colbert Report embeds the information in a without
2468             # a URL prefix; so extract the alternate reference
2469             # and then add the URL prefix manually.
2471             altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2472             if len(altMovieParams) == 0:
2473                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2476                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2478         playerUrl_raw = mMovieParams[0][0]
2479         self.report_player_url(epTitle)
2481             urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2482             playerUrl = urlHandle.geturl()
2483         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2484             self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# The mrss feed lists one <item> per part of the episode.
2487         uri = mMovieParams[0][1]
2488         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2489         self.report_index_download(epTitle)
2491             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2492         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2493             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2498         idoc = xml.etree.ElementTree.fromstring(indexXml)
2499         itemEls = idoc.findall('.//item')
2500         for itemEl in itemEls:
# The guid looks like "...:<show>.com:<id>"; split out both pieces.
2501             mediaId = itemEl.findall('./guid')[0].text
2502             shortMediaId = mediaId.split(':')[-1]
2503             showId = mediaId.split(':')[-2].replace('.com', '')
2504             officialTitle = itemEl.findall('./title')[0].text
2505             officialDate = itemEl.findall('./pubDate')[0].text
2507             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2508                     compat_urllib_parse.urlencode({'uri': mediaId}))
2509             configReq = compat_urllib_request.Request(configUrl)
2510             self.report_config_download(epTitle)
2512                 configXml = compat_urllib_request.urlopen(configReq).read()
2513             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2514                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for every rendition of this part.
2517             cdoc = xml.etree.ElementTree.fromstring(configXml)
2519             for rendition in cdoc.findall('.//rendition'):
2520                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2524                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2527             if self._downloader.params.get('listformats', None):
2528                 self._print_formats([i[0] for i in turls])
2531             # For now, just pick the highest bitrate
2532             format,video_url = turls[-1]
2534             # Get the format arg from the arg stream
2535             req_format = self._downloader.params.get('format', None)
2537             # Select format if we can find one
2540                 format, video_url = f, v
2543             # Patch to download from alternative CDN, which does not
2544             # break on current RTMPDump builds
2545             broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2546             better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2548             if video_url.startswith(broken_cdn):
2549                 video_url = video_url.replace(broken_cdn, better_cdn)
2551             effTitle = showId + u'-' + epTitle
2556                 'upload_date': officialDate,
2561                 'description': officialTitle,
2562                 'player_url': None #playerUrl
2565             results.append(info)
2570 class EscapistIE(InfoExtractor):
2571     """Information extractor for The Escapist """
# Named groups: showname (series slug) and episode (video slug).
2573     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2574     IE_NAME = u'escapist'
2576     def report_extraction(self, showName):
2577         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2579     def report_config_download(self, showName):
2580         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2582     def _real_extract(self, url):
2583         mobj = re.match(self._VALID_URL, url)
2585             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2587         showName = mobj.group('showname')
2588         videoId = mobj.group('episode')
2590         self.report_extraction(showName)
2592             webPage = compat_urllib_request.urlopen(url)
2593             webPageBytes = webPage.read()
# Decode using the charset declared in the Content-Type header,
# defaulting to utf-8 when none is declared.
2594             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2595             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2596         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2597             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull description/thumbnail/player from the page's meta tags, then
# recover the JSON config URL from the player's "config=" parameter.
# NOTE(review): each .group(1) assumes its <meta> tag matched; a missing
# tag raises AttributeError here instead of a clean extractor error.
2600         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2601         description = unescapeHTML(descMatch.group(1))
2602         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2603         imgUrl = unescapeHTML(imgMatch.group(1))
2604         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2605         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2606         configUrlMatch = re.search('config=(.*)$', playerUrl)
2607         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2609         self.report_config_download(showName)
2611             configJSON = compat_urllib_request.urlopen(configUrl).read()
2612         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2613             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2616         # Technically, it's JavaScript, not JSON
2617         configJSON = configJSON.replace("'", '"')
2620             config = json.loads(configJSON)
2621         except (ValueError,) as err:
2622             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The second playlist entry carries the actual media URL.
2625         playlist = config['playlist']
2626         videoUrl = playlist[1]['url']
2631             'uploader': showName,
2632             'upload_date': None,
2635             'thumbnail': imgUrl,
2636             'description': description,
2637             'player_url': playerUrl,
2643 class CollegeHumorIE(InfoExtractor):
2644     """Information extractor for collegehumor.com"""
# videoid is the numeric id from the URL; shorttitle is the slug.
2646     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2647     IE_NAME = u'collegehumor'
2649     def report_webpage(self, video_id):
2650         """Report information extraction."""
2651         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2653     def report_extraction(self, video_id):
2654         """Report information extraction."""
2655         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2657     def _real_extract(self, url):
2658         mobj = re.match(self._VALID_URL, url)
2660             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2662         video_id = mobj.group('videoid')
2664         self.report_webpage(video_id)
2665         request = compat_urllib_request.Request(url)
2667             webpage = compat_urllib_request.urlopen(request).read()
2668         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2669             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page embeds an internal numeric id (distinct from the URL's
# videoid) needed to query the moogaloop metadata service.
2672         m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2674             self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2676         internal_video_id = m.group('internalvideoid')
2680             'internal_id': internal_video_id,
2682             'upload_date': None,
2685         self.report_extraction(video_id)
2686         xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2688             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2689         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2690             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The moogaloop XML's <video> node provides description, caption (the
# title), file URL and thumbnail; ext is taken from the file URL suffix.
2693         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2695             videoNode = mdoc.findall('./video')[0]
2696             info['description'] = videoNode.findall('./description')[0].text
2697             info['title'] = videoNode.findall('./caption')[0].text
2698             info['url'] = videoNode.findall('./file')[0].text
2699             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2700             info['ext'] = info['url'].rpartition('.')[2]
2702             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2708 class XVideosIE(InfoExtractor):
2709     """Information extractor for xvideos.com"""
2711     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2712     IE_NAME = u'xvideos'
2714     def report_webpage(self, video_id):
2715         """Report information extraction."""
2716         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2718     def report_extraction(self, video_id):
2719         """Report information extraction."""
2720         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2722     def _real_extract(self, url):
2723         mobj = re.match(self._VALID_URL, url)
2725             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2727         video_id = mobj.group(1).decode('utf-8')
2729         self.report_webpage(video_id)
2731         request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2733             webpage = compat_urllib_request.urlopen(request).read()
2734         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2735             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2738         self.report_extraction(video_id)
# The flv URL is URL-encoded inside the page's flv_url parameter.
2742         mobj = re.search(r'flv_url=(.+?)&', webpage)
2744             self._downloader.trouble(u'ERROR: unable to extract video url')
2746         video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text minus the trailing "- XVIDEOS..." suffix.
2750         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2752             self._downloader.trouble(u'ERROR: unable to extract video title')
2754         video_title = mobj.group(1).decode('utf-8')
2757         # Extract video thumbnail
# NOTE(review): the dots in "img.*?" and "xvideos.com" are unescaped and
# match any character — harmless in practice but worth tightening.
2758         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2760             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2762         video_thumbnail = mobj.group(0).decode('utf-8')
2768             'upload_date': None,
2769             'title': video_title,
2771             'thumbnail': video_thumbnail,
2772             'description': None,
2778 class SoundcloudIE(InfoExtractor):
2779     """Information extractor for soundcloud.com
2780        To access the media, the uid of the song and a stream token
2781        must be extracted from the page source and the script must make
2782        a request to media.soundcloud.com/crossdomain.xml. Then
2783        the media can be grabbed by requesting from an url composed
2784        of the stream token and uid
# group(1) is the uploader slug, group(2) the track slug.
2787     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2788     IE_NAME = u'soundcloud'
2790     def __init__(self, downloader=None):
2791         InfoExtractor.__init__(self, downloader)
2793     def report_webpage(self, video_id):
2794         """Report information extraction."""
2795         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2797     def report_extraction(self, video_id):
2798         """Report information extraction."""
2799         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2801     def _real_extract(self, url):
2802         mobj = re.match(self._VALID_URL, url)
2804             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2807         # extract uploader (which is in the url)
2808         uploader = mobj.group(1).decode('utf-8')
2809         # extract simple title (uploader + slug of song title)
2810         slug_title =  mobj.group(2).decode('utf-8')
# Fallback title used when the real title can't be scraped below.
2811         simple_title = uploader + u'-' + slug_title
2813         self.report_webpage('%s/%s' % (uploader, slug_title))
2815         request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2817             webpage = compat_urllib_request.urlopen(request).read()
2818         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2819             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2822         self.report_extraction('%s/%s' % (uploader, slug_title))
2824         # extract uid and stream token that soundcloud hands out for access
2825         mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2827             video_id = mobj.group(1)
2828             stream_token = mobj.group(2)
2830         # extract unsimplified title
2831         mobj = re.search('"title":"(.*?)",', webpage)
2833             title = mobj.group(1).decode('utf-8')
2835             title = simple_title
2837         # construct media url (with uid/token)
2838         mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2839         mediaURL = mediaURL % (video_id, stream_token)
2842         description = u'No description available'
2843         mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2845             description = mobj.group(1)
# Page date looks like e.g. "November 3, 2011 14:32"; normalize to
# YYYYMMDD, logging (not aborting) on parse failure.
2849         mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2852                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2853             except Exception as err:
2854                 self._downloader.to_stderr(compat_str(err))
2856         # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): this Request appears to be constructed but never opened
# in the visible code — confirm whether the crossdomain fetch happens.
2857         request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2860             'id':       video_id.decode('utf-8'),
2862             'uploader': uploader.decode('utf-8'),
2863             'upload_date':  upload_date,
2866             'description': description.decode('utf-8')
2870 class InfoQIE(InfoExtractor):
2871     """Information extractor for infoq.com"""
2873     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2876     def report_webpage(self, video_id):
2877         """Report information extraction."""
2878         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2880     def report_extraction(self, video_id):
2881         """Report information extraction."""
2882         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2884     def _real_extract(self, url):
2885         mobj = re.match(self._VALID_URL, url)
2887             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2890         self.report_webpage(url)
2892         request = compat_urllib_request.Request(url)
2894             webpage = compat_urllib_request.urlopen(request).read()
2895         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2896             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2899         self.report_extraction(url)
# The rtmpe path is base64-encoded in the page's jsclassref attribute.
# NOTE(review): str.decode('base64') exists only on Python 2.
2903         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2905             self._downloader.trouble(u'ERROR: unable to extract video url')
2907         video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2911         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2913             self._downloader.trouble(u'ERROR: unable to extract video title')
2915         video_title = mobj.group(1).decode('utf-8')
2917         # Extract description
2918         video_description = u'No description available.'
2919         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2920         if mobj is not None:
2921             video_description = mobj.group(1).decode('utf-8')
# The media filename's last path component yields both id and extension.
2923         video_filename = video_url.split('/')[-1]
2924         video_id, extension = video_filename.split('.')
2930             'upload_date': None,
2931             'title': video_title,
2932             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2934             'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> either {bitrate: [urls]} or a plain
        [urls] list when no bitrate information exists.  'best' (or an
        unknown bitrate) selects the highest available bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None
        return None

    def _print_formats(self, formats):
        """Print a format/bitrate/extension table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a Mixcloud cloudcast via the site's JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Walk the formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break  # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the .decode('utf-8') calls below are no-ops/errors on
        # py3 str — confirm which interpreter this file targets.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            # Conditional expression instead of the fragile `and/or` idiom.
            'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video page, a course page
        (recursing into its videos), or the root page (recursing into
        its courses).  Returns a list of info dictionaries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'):  # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'):  # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video-page links and recurse into each.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else:  # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course-page link and recurse into each.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape an MTV video page, then fetch its mediaGen XML to get
        the stream URL.  Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme-less URLs are accepted by _VALID_URL; normalize them.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed garbled message: was 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (multi-segment FLV/MP4 streams)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components,
        # mimicking the site's player.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)
        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        # Linear-congruential shuffle: repeatedly pick-and-remove one char.
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated numeric file id through the mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the file id, and emit one info
        dict per stream segment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            # xrange → range: the sibling helper above already uses range,
            # and xrange does not exist on Python 3.
            for i in range(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # TODO check error
            # youku only could be viewed from mainland china
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the page source."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Consistency fix: wrap err in compat_str like the other extractors.
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""  # typo fix: was "extry"
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3474 def _real_extract(self, url):
3475 # Extract id from URL
3476 mobj = re.match(self._VALID_URL, url)
3478 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3481 post_url = mobj.group(0)
3482 video_id = mobj.group(2)
3484 video_extension = 'flv'
3486 # Step 1, Retrieve post webpage to extract further information
3487 self.report_extract_entry(post_url)
3488 request = compat_urllib_request.Request(post_url)
3490 webpage = compat_urllib_request.urlopen(request).read()
3491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3492 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3495 # Extract update date
3497 pattern = 'title="Timestamp">(.*?)</a>'
3498 mobj = re.search(pattern, webpage)
3500 upload_date = mobj.group(1)
3501 # Convert timestring to a format suitable for filename
3502 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3503 upload_date = upload_date.strftime('%Y%m%d')
3504 self.report_date(upload_date)
3508 pattern = r'rel\="author".*?>(.*?)</a>'
3509 mobj = re.search(pattern, webpage)
3511 uploader = mobj.group(1)
3512 self.report_uploader(uploader)
3515 # Get the first line for title
3517 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3518 mobj = re.search(pattern, webpage)
3520 video_title = mobj.group(1)
3521 self.report_title(video_title)
3523 # Step 2, Stimulate clicking the image box to launch video
3524 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3525 mobj = re.search(pattern, webpage)
3527 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3529 video_page = mobj.group(1)
3530 request = compat_urllib_request.Request(video_page)
3532 webpage = compat_urllib_request.urlopen(request).read()
3533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3536 self.report_extract_vid_page(video_page)
3539 # Extract video links on video page
3540 """Extract video links of all sizes"""
3541 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3542 mobj = re.findall(pattern, webpage)
3544 self._downloader.trouble(u'ERROR: unable to extract video links')
3546 # Sort in resolution
3547 links = sorted(mobj)
3549 # Choose the lowest of the sort, i.e. highest resolution
3550 video_url = links[-1]
3551 # Only get the url. The resolution part in the tuple has no use anymore
3552 video_url = video_url[-1]
3553 # Treat escaped \u0026 style hex
3554 video_url = unicode(video_url, "unicode_escape")
3558 'id': video_id.decode('utf-8'),
3560 'uploader': uploader.decode('utf-8'),
3561 'upload_date': upload_date.decode('utf-8'),
3562 'title': video_title.decode('utf-8'),
3563 'ext': video_extension.decode('utf-8'),