2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header for the two lines
    # below is not visible in this chunk of the file.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches youtu.be, watch/embed/e/v URLs; group 2 captures the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container/extension; remaining entries elided in this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> "WxH" dimension string; entries elided in this view.
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SubRip (SRT) text.

        NOTE(review): the initialisation of the `srt` accumulator and the
        float conversion of `start` are on lines not visible in this view.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration (seconds) when no dur attribute
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by the SRT format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is not visible here.
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        Credentials come from --username/--password or from .netrc.
        NOTE(review): several guard/return lines and `try:` headers from the
        original are elided in this view; indentation below is reconstructed.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English site so later regexes match reliably.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # (login_form dict opener elided in this view)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (required for age-restricted videos).
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the real media URL(s) and metadata for a YouTube video.

        NOTE(review): several `if`/`else`/`try:`/`return` lines from the
        original are elided in this view; indentation is reconstructed.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage; has_verified=1 skips the age interstitial.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the backslash-escaped URL found in the page JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several el= values until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page, normalised to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions: download and convert to SRT if requested.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit option, then English, then first available.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems are warnings only; extraction continues.
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded query string.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit (list is best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (result-dict opener elided in this view)
                'id':		video_id.decode('utf-8'),
                'url':		video_real_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	upload_date,
                'title':	video_title,
                'ext':		video_extension.decode('utf-8'),
                'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                'player_url':	player_url,
                'subtitles':	video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id (may be a yt-XXXX YouTube passthrough id).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter.

        NOTE(review): `try:` headers and the disclaimer_form opener are
        elided in this view; indentation is reconstructed.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (disables the family filter for this session).
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URL and metadata from a metacafe watch page.

        NOTE(review): several guard/`return`/`try:` lines are elided in this
        view; indentation is reconstructed.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob instead of &mediaURL=.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (result-dict opener elided in this view)
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video id (before the first underscore in the slug).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL and metadata from a Dailymotion video page.

        NOTE(review): guard/`return`/`try:` lines and the assignment of
        `video_url` are elided in this view; indentation is reconstructed.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (result-dict opener elided in this view)
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the docid query parameter.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL and metadata from a Google Video page.

        NOTE(review): guard/`return`/`try:` lines and some assignments
        (e.g. `video_url`) are elided in this view; indentation is
        reconstructed.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No direct download URL: fall back to the escaped flash videoUrl.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Un-escape the \xNN sequences embedded in the page JS.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info

        # (result-dict opener elided in this view)
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename from the ?current= query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page.

        NOTE(review): guard/`return`/`try:` lines and the assignment of
        `video_url` are elided in this view; indentation is reconstructed.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> match.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (result-dict opener elided in this view)
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader,
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from a Yahoo! Video page.

        Non-/watch/ URLs are rewritten to a canonical /watch/ URL and the
        method recurses once (new_video=False guards against loops).
        NOTE(review): guard/`return`/`try:` lines are elided in this view;
        indentation is reconstructed.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternation; the
        # uploader name appears to be group(2) — verify against a live page.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (result-dict opener elided in this view)
            'id':		video_id.decode('utf-8'),
            'uploader':	video_uploader,
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
            'thumbnail':	video_thumbnail.decode('utf-8'),
            'description':	video_description,
            # NOTE(review): duplicate 'thumbnail' key — this entry silently
            # overrides the decoded one above; probably one should be removed.
            'thumbnail':	video_thumbnail,
977 class VimeoIE(InfoExtractor):
978 """Information extractor for vimeo.com."""
980 # _VALID_URL matches Vimeo URLs
# Matches plain, player, and group URLs; group(1) captures the numeric clip id.
981 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
984 def __init__(self, downloader=None):
985 InfoExtractor.__init__(self, downloader)
987 def report_download_webpage(self, video_id):
988 """Report webpage download."""
989 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
991 def report_extraction(self, video_id):
992 """Report information extraction."""
993 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# NOTE(review): this excerpt elides some original lines (e.g. "if mobj is
# None:" guards, "try:" headers and "return"s implied by the error calls
# and "except" clauses below) — confirm against the full file.
995 def _real_extract(self, url, new_video=True):
996 # Extract ID from URL
997 mobj = re.match(self._VALID_URL, url)
# Reached only when the URL failed to match _VALID_URL (guard elided here).
999 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1002 video_id = mobj.group(1)
1004 # Retrieve video webpage to extract further information
# std_headers mimics a regular browser request; presumably Vimeo serves
# different markup to unknown user agents — verify against std_headers def.
1005 request = urllib2.Request(url, None, std_headers)
1007 self.report_download_webpage(video_id)
1008 webpage = urllib2.urlopen(request).read()
1009 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1013 # Now we begin extracting as much information as we can from what we
1014 # retrieved. First we extract the information common to all extractors,
1015 # and latter we extract those that are Vimeo specific.
1016 self.report_extraction(video_id)
1018 # Extract the config JSON
# Fragile: carves the config object out of inline JavaScript by the exact
# ' = {config:' and ',assets:' delimiters instead of using a real parser;
# an IndexError here is presumably caught by an elided try/except.
1019 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1021 config = json.loads(config)
1023 self._downloader.trouble(u'ERROR: unable to extract info section')
# Common fields pulled straight from the parsed config JSON.
1027 video_title = config["video"]["title"]
1030 video_uploader = config["video"]["owner"]["name"]
1032 # Extract video thumbnail
1033 video_thumbnail = config["video"]["thumbnail"]
1035 # Extract video description
# Description comes from the HTML page (not the config JSON); optional.
1036 video_description = get_element_by_id("description", webpage.decode('utf8'))
1037 if video_description: video_description = clean_html(video_description)
1038 else: video_description = ''
1040 # Extract upload date
1041 video_upload_date = u'NA'
1042 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1043 if mobj is not None:
1044 video_upload_date = mobj.group(1)
1046 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL below.
1047 sig = config['request']['signature']
1048 timestamp = config['request']['timestamp']
1050 # Vimeo specific: extract video codec and quality information
1051 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 (both in flv).
1052 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053 for codec in codecs:
1054 if codec[0] in config["video"]["files"]:
1055 video_codec = codec[0]
1056 video_extension = codec[1]
# 'hd' offered for this codec wins over 'sd'.
1057 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1058 else: quality = 'sd'
# Reached when no codec from the list above was found (loop else/guard elided).
1061 self._downloader.trouble(u'ERROR: no known codec found')
# Build the final media URL from clip id + signature + timestamp + quality.
1064 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1065 %(video_id, sig, timestamp, quality, video_codec.upper())
# Tail of the returned info dict ('id'/'url' entries elided in this excerpt).
1070 'uploader': video_uploader,
1071 'upload_date': video_upload_date,
1072 'title': video_title,
1073 'ext': video_extension,
1074 'thumbnail': video_thumbnail,
1075 'description': video_description,
1080 class ArteTvIE(InfoExtractor):
1081 """arte.tv information extractor."""
# French/German video pages only; _LIVE_URL distinguishes the live-stream
# index pages from regular "+7" catch-up videos.
1083 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1084 _LIVE_URL = r'index-[0-9]+\.html$'
1086 IE_NAME = u'arte.tv'
1088 def __init__(self, downloader=None):
1089 InfoExtractor.__init__(self, downloader)
1091 def report_download_webpage(self, video_id):
1092 """Report webpage download."""
1093 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1095 def report_extraction(self, video_id):
1096 """Report information extraction."""
1097 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a page and return its raw body; network errors are routed to
# the downloader's trouble() handler. (try:/return lines elided here.)
1099 def fetch_webpage(self, url):
1100 self._downloader.increment_downloads()
1101 request = urllib2.Request(url)
1103 self.report_download_webpage(url)
1104 webpage = urllib2.urlopen(request).read()
1105 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1106 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1108 except ValueError, err:
1109 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, apply `regex` with `regexFlags`, and build a dict from
# matchTuples, each (group_index, dict_key, error_message). Any missing
# group raises the supplied error via trouble().
1113 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1114 page = self.fetch_webpage(url)
1115 mobj = re.search(regex, page, regexFlags)
# Reached when the regex did not match at all (guard elided).
1119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1122 for (i, key, err) in matchTuples:
1123 if mobj.group(i) is None:
1124 self._downloader.trouble(err)
1127 info[key] = mobj.group(i)
# Live streams: locate the videothek JS config, then the rtmp path/player.
1131 def extractLiveStream(self, url):
# URL layout presumably .../<lang>/videos/.../index-N.html; 4th-from-last
# path segment is the language code — TODO confirm.
1133 video_lang = url.split('/')[-4]
1135 info = self.grep_webpage(
1137 r'src="(.*?/videothek_js.*?\.js)',
1140 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1144 http_host = url.split('/')[2]
1145 next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
1147 info = self.grep_webpage(
1149 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1150 '(http://.*?\.swf).*?' +
1154 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1155 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1156 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1160 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# NOTE(review): live streams are not downloaded — the equivalent rtmpdump
# command line is only printed for the user to run manually.
1162 print u'rtmpdump --swfVfy \'%s\' --rtmp \'%s\' --live -o arte-live.mp4' % (info.get('player'), video_url)
# "+7" catch-up videos: follow a 3-hop chain of reference XML documents
# (videorefFileUrl -> per-language <video ref> -> final <url quality="hd">).
1164 def extractPlus7Stream(self, url):
1166 video_lang = url.split('/')[-3]
1168 info = self.grep_webpage(
1170 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1173 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1177 next_url = urllib.unquote(info.get('url'))
1179 info = self.grep_webpage(
1181 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1184 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1188 next_url = urllib.unquote(info.get('url'))
1190 info = self.grep_webpage(
1192 r'<video id="(.*?)".*?>.*?' +
1193 '<name>(.*?)</name>.*?' +
1194 '<dateVideo>(.*?)</dateVideo>.*?' +
1195 '<url quality="hd">(.*?)</url>',
1198 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1199 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1200 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1201 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Returned info dict (surrounding 'return {' elided in this excerpt).
1206 'id': info.get('id'),
1207 'url': urllib.unquote(info.get('url')),
1208 'uploader': u'arte.tv',
1209 'upload_date': info.get('date'),
1210 'title': info.get('title'),
1216 def _real_extract(self, url):
1218 video_id = url.split('/')[-1]
1220 self.report_extraction(video_id)
# Dispatch: live index pages vs regular +7 videos.
1222 if re.search(self._LIVE_URL, video_id) is not None:
1223 self.extractLiveStream(url)
1226 info = self.extractPlus7Stream(url)
1229 # Process video information
1230 self._downloader.process_info(info)
1231 except UnavailableVideoError, err:
1232 self._downloader.trouble(u'\nERROR: unable to download video')
1235 class GenericIE(InfoExtractor):
1236 """Generic last-resort information extractor."""
1239 IE_NAME = u'generic'
1241 def __init__(self, downloader=None):
1242 InfoExtractor.__init__(self, downloader)
1244 def report_download_webpage(self, video_id):
1245 """Report webpage download."""
# Always warn first: reaching this extractor means no specific IE matched.
1246 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1247 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1249 def report_extraction(self, video_id):
1250 """Report information extraction."""
1251 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1253 def report_following_redirect(self, new_url):
1254 """Report information extraction."""
1255 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Issue a HEAD request to detect URL-shortener style redirects; when the
# final URL differs, restart the whole extraction chain on the new URL.
1257 def _test_redirect(self, url):
1258 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass forcing the HEAD method (body of get_method elided).
1259 class HeadRequest(urllib2.Request):
1260 def get_method(self):
1263 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1265 Subclass the HTTPRedirectHandler to make it use our
1266 HeadRequest also on the redirected URL
1268 def redirect_request(self, req, fp, code, msg, headers, newurl):
1269 if code in (301, 302, 303, 307):
# Some servers send unencoded spaces in Location headers.
1270 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD request carries no body.
1271 newheaders = dict((k,v) for k,v in req.headers.items()
1272 if k.lower() not in ("content-length", "content-type"))
1273 return HeadRequest(newurl,
1275 origin_req_host=req.get_origin_req_host(),
# Non-redirect status codes propagate as HTTPError.
1278 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1280 class HTTPMethodFallback(urllib2.BaseHandler):
1282 Fallback to GET if HEAD is not allowed (405 HTTP error)
1284 def http_error_405(self, req, fp, code, msg, headers):
1288 newheaders = dict((k,v) for k,v in req.headers.items()
1289 if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request through the parent opener.
1290 return self.parent.open(urllib2.Request(req.get_full_url(),
1292 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1296 opener = urllib2.OpenerDirector()
1297 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1298 HTTPMethodFallback, HEADRedirectHandler,
1299 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1300 opener.add_handler(handler())
1302 response = opener.open(HeadRequest(url))
1303 new_url = response.geturl()
# Same URL back means no redirect: let the normal extraction proceed.
1305 if url == new_url: return False
1307 self.report_following_redirect(new_url)
# Restart the whole download chain on the redirect target.
1308 self._downloader.download([new_url])
1311 def _real_extract(self, url):
1312 if self._test_redirect(url): return
1314 video_id = url.split('/')[-1]
1315 request = urllib2.Request(url)
1317 self.report_download_webpage(video_id)
1318 webpage = urllib2.urlopen(request).read()
1319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1322 except ValueError, err:
1323 # since this is the last-resort InfoExtractor, if
1324 # this error is thrown, it'll be thrown here
1325 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1328 self.report_extraction(video_id)
1329 # Start with something easy: JW Player in SWFObject
1330 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1332 # Broaden the search a little bit
# Fallback: any file=/source= HTTP URL anywhere in the page.
1333 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1335 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1338 # It's possible that one of the regexes
1339 # matched, but returned an empty group:
1340 if mobj.group(1) is None:
1341 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1344 video_url = urllib.unquote(mobj.group(1))
# Derive id and extension from the media URL's basename.
1345 video_id = os.path.basename(video_url)
1347 # here's a fun little line of code for you:
1348 video_extension = os.path.splitext(video_id)[1][1:]
1349 video_id = os.path.splitext(video_id)[0]
1351 # it's tempting to parse this further, but you would
1352 # have to take into account all the variations like
1353 # Video Title - Site Name
1354 # Site Name | Video Title
1355 # Video Title - Tagline | Site Name
1356 # and so on and so forth; it's just not practical
1357 mobj = re.search(r'<title>(.*)</title>', webpage)
1359 self._downloader.trouble(u'ERROR: unable to extract title')
1361 video_title = mobj.group(1).decode('utf-8')
1363 # video uploader is domain name
1364 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says "title" but the code is extracting
# the uploader (domain name) — looks copy-pasted from the block above.
1366 self._downloader.trouble(u'ERROR: unable to extract title')
1368 video_uploader = mobj.group(1).decode('utf-8')
# Returned info dict (surrounding 'return {' elided in this excerpt).
1371 'id': video_id.decode('utf-8'),
1372 'url': video_url.decode('utf-8'),
1373 'uploader': video_uploader,
1374 'upload_date': u'NA',
1375 'title': video_title,
1376 'ext': video_extension.decode('utf-8'),
1382 class YoutubeSearchIE(InfoExtractor):
1383 """Information Extractor for YouTube search queries."""
# Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
1384 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData API v2, JSON-C format, 50 results per request (the API page size).
1385 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1386 _max_youtube_results = 1000
1387 IE_NAME = u'youtube:search'
1389 def __init__(self, downloader=None):
1390 InfoExtractor.__init__(self, downloader)
1392 def report_download_page(self, query, pagenum):
1393 """Report attempt to download search page with given number."""
1394 query = query.decode(preferredencoding())
1395 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (no number = 1 result, "all" = max, N = N results)
# and delegate to _download_n_results.
1397 def _real_extract(self, query):
1398 mobj = re.match(self._VALID_URL, query)
1400 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1403 prefix, query = query.split(':')
1405 query = query.encode('utf-8')
# Empty prefix: default to a single result.
1407 self._download_n_results(query, 1)
1409 elif prefix == 'all':
1410 self._download_n_results(query, self._max_youtube_results)
# Below: the numeric-prefix branch (int(prefix) parse elided); n <= 0 is
# rejected, n above the cap is clamped with a warning.
1416 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1418 elif n > self._max_youtube_results:
1419 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1420 n = self._max_youtube_results
1421 self._download_n_results(query, n)
1423 except ValueError: # parsing prefix as integer fails
1424 self._download_n_results(query, 1)
1427 def _download_n_results(self, query, n):
1428 """Downloads a specified number of results for a query"""
# Page through the API 50 ids at a time until `limit` is reached.
1434 while (50 * pagenum) < limit:
1435 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
1436 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1437 request = urllib2.Request(result_url)
1439 data = urllib2.urlopen(request).read()
1440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1441 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1443 api_response = json.loads(data)['data']
1445 new_ids = list(video['id'] for video in api_response['items'])
1446 video_ids += new_ids
# Never ask for more than the API reports as available.
1448 limit = min(n, api_response['totalItems'])
1451 if len(video_ids) > n:
1452 video_ids = video_ids[:n]
# Hand each result to the downloader as a regular watch URL.
1453 for id in video_ids:
1454 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1458 class GoogleSearchIE(InfoExtractor):
1459 """Information Extractor for Google Video search queries."""
# Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
1460 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1461 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Scrapes result links out of the HTML (no API); docid is the video id.
1462 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager element means more results exist.
1463 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1464 _max_google_results = 1000
1465 IE_NAME = u'video.google:search'
1467 def __init__(self, downloader=None):
1468 InfoExtractor.__init__(self, downloader)
1470 def report_download_page(self, query, pagenum):
1471 """Report attempt to download playlist page with given number."""
1472 query = query.decode(preferredencoding())
1473 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Prefix parsing mirrors YoutubeSearchIE: empty = 1, "all" = max, N = N.
1475 def _real_extract(self, query):
1476 mobj = re.match(self._VALID_URL, query)
1478 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1481 prefix, query = query.split(':')
1483 query = query.encode('utf-8')
1485 self._download_n_results(query, 1)
1487 elif prefix == 'all':
1488 self._download_n_results(query, self._max_google_results)
# Numeric-prefix branch (int(prefix) parse elided): reject n <= 0, clamp
# to the maximum with a warning.
1494 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1496 elif n > self._max_google_results:
1497 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1498 n = self._max_google_results
1499 self._download_n_results(query, n)
1501 except ValueError: # parsing prefix as integer fails
1502 self._download_n_results(query, 1)
1505 def _download_n_results(self, query, n):
1506 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results per page via start=pagenum*10) until
# n ids are collected or the "next page" marker disappears.
1512 self.report_download_page(query, pagenum)
1513 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1514 request = urllib2.Request(result_url)
1516 page = urllib2.urlopen(request).read()
1517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1521 # Extract video identifiers
1522 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1523 video_id = mobj.group(1)
# De-duplicate: the same docid may appear multiple times in one page.
1524 if video_id not in video_ids:
1525 video_ids.append(video_id)
1526 if len(video_ids) == n:
1527 # Specified n videos reached
1528 for id in video_ids:
1529 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: download whatever was collected, even if fewer than n.
1532 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1533 for id in video_ids:
1534 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1537 pagenum = pagenum + 1
1540 class YahooSearchIE(InfoExtractor):
1541 """Information Extractor for Yahoo! Video search queries."""
# Queries look like "yvsearch:foo", "yvsearchN:foo" or "yvsearchall:foo".
1542 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1543 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Ids are "<uploader>/<video>" pairs scraped from result links.
1544 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1545 _MORE_PAGES_INDICATOR = r'\s*Next'
1546 _max_yahoo_results = 1000
1547 IE_NAME = u'video.yahoo:search'
1549 def __init__(self, downloader=None):
1550 InfoExtractor.__init__(self, downloader)
1552 def report_download_page(self, query, pagenum):
1553 """Report attempt to download playlist page with given number."""
1554 query = query.decode(preferredencoding())
1555 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Prefix parsing mirrors the other search IEs: empty = 1, "all" = max,
# N = N (clamped to the maximum with a warning).
1557 def _real_extract(self, query):
1558 mobj = re.match(self._VALID_URL, query)
1560 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1563 prefix, query = query.split(':')
1565 query = query.encode('utf-8')
1567 self._download_n_results(query, 1)
1569 elif prefix == 'all':
1570 self._download_n_results(query, self._max_yahoo_results)
1576 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1578 elif n > self._max_yahoo_results:
1579 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1580 n = self._max_yahoo_results
1581 self._download_n_results(query, n)
1583 except ValueError: # parsing prefix as integer fails
1584 self._download_n_results(query, 1)
1587 def _download_n_results(self, query, n):
1588 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, uses a set for O(1) duplicate checks while
# keeping `video_ids` as the ordered result list.
1591 already_seen = set()
1595 self.report_download_page(query, pagenum)
1596 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1597 request = urllib2.Request(result_url)
1599 page = urllib2.urlopen(request).read()
1600 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1604 # Extract video identifiers
1605 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1606 video_id = mobj.group(1)
1607 if video_id not in already_seen:
1608 video_ids.append(video_id)
1609 already_seen.add(video_id)
1610 if len(video_ids) == n:
1611 # Specified n videos reached
1612 for id in video_ids:
1613 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: download whatever was collected, even if fewer than n.
1616 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1617 for id in video_ids:
1618 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1621 pagenum = pagenum + 1
1624 class YoutubePlaylistIE(InfoExtractor):
1625 """Information Extractor for YouTube playlists."""
# group(1): playlist type key (p/a/list...), group(2): playlist id,
# group(3): an optional trailing video id (handled as a direct download).
1627 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1628 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1629 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
# Presence of the pager's "next" control means another page exists.
1630 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1631 IE_NAME = u'youtube:playlist'
1633 def __init__(self, downloader=None):
1634 InfoExtractor.__init__(self, downloader)
1636 def report_download_page(self, playlist_id, pagenum):
1637 """Report attempt to download playlist page with given number."""
1638 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1640 def _real_extract(self, url):
1641 # Extract playlist id
1642 mobj = re.match(self._VALID_URL, url)
1644 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video inside a playlist URL: download just that video.
1648 if mobj.group(3) is not None:
1649 self._downloader.download([mobj.group(3)])
1652 # Download playlist pages
1653 # prefix is 'p' as default for playlists but there are other types that need extra care
1654 playlist_prefix = mobj.group(1)
# Artist pages ('a') use the 'artist' endpoint; everything else is
# normalized to the classic 'p'/view_play_list form.
1655 if playlist_prefix == 'a':
1656 playlist_access = 'artist'
1658 playlist_prefix = 'p'
1659 playlist_access = 'view_play_list'
1660 playlist_id = mobj.group(2)
# Page loop (loop header and pagenum init elided in this excerpt).
1665 self.report_download_page(playlist_id, pagenum)
1666 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1667 request = urllib2.Request(url)
1669 page = urllib2.urlopen(request).read()
1670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1674 # Extract video identifiers
# Per-page de-duplication; playlist order is preserved.
1676 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1677 if mobj.group(1) not in ids_in_page:
1678 ids_in_page.append(mobj.group(1))
1679 video_ids.extend(ids_in_page)
1681 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1683 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end; playliststart is stored 1-based.
1685 playliststart = self._downloader.params.get('playliststart', 1) - 1
1686 playlistend = self._downloader.params.get('playlistend', -1)
1687 if playlistend == -1:
1688 video_ids = video_ids[playliststart:]
1690 video_ids = video_ids[playliststart:playlistend]
1692 for id in video_ids:
1693 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1697 class YoutubeUserIE(InfoExtractor):
1698 """Information Extractor for YouTube users."""
# Accepts user-page URLs or the "ytuser:<name>" shorthand.
1700 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1701 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each uploads-feed request at 50 entries.
1702 _GDATA_PAGE_SIZE = 50
1703 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1704 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1705 IE_NAME = u'youtube:user'
1707 def __init__(self, downloader=None):
1708 InfoExtractor.__init__(self, downloader)
1710 def report_download_page(self, username, start_index):
1711 """Report attempt to download user page."""
1712 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1713 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1715 def _real_extract(self, url):
1717 mobj = re.match(self._VALID_URL, url)
1719 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1722 username = mobj.group(1)
1724 # Download video ids using YouTube Data API. Result size per
1725 # query is limited (currently to 50 videos) so we need to query
1726 # page by page until there are no video ids - it means we got
# Page loop (loop header and pagenum init elided); start-index is 1-based.
1733 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1734 self.report_download_page(username, start_index)
1736 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1739 page = urllib2.urlopen(request).read()
1740 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1741 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1744 # Extract video identifiers
# Per-page de-duplication; the feed may repeat ids within one page.
1747 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1748 if mobj.group(1) not in ids_in_page:
1749 ids_in_page.append(mobj.group(1))
1751 video_ids.extend(ids_in_page)
1753 # A little optimization - if current page is not
1754 # "full", ie. does not contain PAGE_SIZE video ids then
1755 # we can assume that this page is the last one - there
1756 # are no more ids on further pages - no need to query
1759 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start/--playlist-end to the collected id list.
1764 all_ids_count = len(video_ids)
1765 playliststart = self._downloader.params.get('playliststart', 1) - 1
1766 playlistend = self._downloader.params.get('playlistend', -1)
1768 if playlistend == -1:
1769 video_ids = video_ids[playliststart:]
1771 video_ids = video_ids[playliststart:playlistend]
1773 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1774 (username, all_ids_count, len(video_ids)))
1776 for video_id in video_ids:
1777 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1780 class BlipTVUserIE(InfoExtractor):
1781 """Information Extractor for blip.tv users."""
# Accepts user-page URLs or the "bliptvuser:<name>" shorthand.
1783 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1785 IE_NAME = u'blip.tv:user'
1787 def __init__(self, downloader=None):
1788 InfoExtractor.__init__(self, downloader)
1790 def report_download_page(self, username, pagenum):
1791 """Report attempt to download user page."""
1792 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1793 (self.IE_NAME, username, pagenum))
1795 def _real_extract(self, url):
1797 mobj = re.match(self._VALID_URL, url)
1799 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1802 username = mobj.group(1)
# Mobile episode-list endpoint; the numeric users_id is filled in below.
1804 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1806 request = urllib2.Request(url)
# First fetch the user page only to scrape the numeric users_id out of it.
1809 page = urllib2.urlopen(request).read().decode('utf-8')
1810 mobj = re.search(r'data-users-id="([^"]+)"', page)
1811 page_base = page_base % mobj.group(1)
1812 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1813 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1817 # Download video ids using BlipTV Ajax calls. Result size per
1818 # query is limited (currently to 12 videos) so we need to query
1819 # page by page until there are no video ids - it means we got
# Page loop (loop header and pagenum init elided in this excerpt).
1826 self.report_download_page(username, pagenum)
1828 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1831 page = urllib2.urlopen(request).read().decode('utf-8')
1832 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1833 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1836 # Extract video identifiers
# Ids are relative hrefs; entities are unescaped before storing.
1839 for mobj in re.finditer(r'href="/([^"]+)"', page):
1840 if mobj.group(1) not in ids_in_page:
1841 ids_in_page.append(unescapeHTML(mobj.group(1)))
1843 video_ids.extend(ids_in_page)
1845 # A little optimization - if current page is not
1846 # "full", ie. does not contain PAGE_SIZE video ids then
1847 # we can assume that this page is the last one - there
1848 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is not defined in this excerpt — presumably a
# class attribute (12, per the comment above) on an elided line; confirm.
1851 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start/--playlist-end to the collected id list.
1856 all_ids_count = len(video_ids)
1857 playliststart = self._downloader.params.get('playliststart', 1) - 1
1858 playlistend = self._downloader.params.get('playlistend', -1)
1860 if playlistend == -1:
1861 video_ids = video_ids[playliststart:]
1863 video_ids = video_ids[playliststart:playlistend]
1865 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1866 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1868 for video_id in video_ids:
1869 self._downloader.download([u'http://blip.tv/'+video_id])
1872 class DepositFilesIE(InfoExtractor):
1873 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the "../" consumes any two-letter locale
# path segment; group(1) is the file id.
1875 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1876 IE_NAME = u'DepositFiles'
1878 def __init__(self, downloader=None):
1879 InfoExtractor.__init__(self, downloader)
1881 def report_download_webpage(self, file_id):
1882 """Report webpage download."""
1883 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1885 def report_extraction(self, file_id):
1886 """Report information extraction."""
1887 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1889 def _real_extract(self, url):
1890 file_id = url.split('/')[-1]
1891 # Rebuild url in english locale
# Force the English page so the regexes below match regardless of the
# locale in the original URL.
1892 url = 'http://depositfiles.com/en/files/' + file_id
1894 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1895 free_download_indication = { 'gateway_result' : '1' }
1896 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1898 self.report_download_webpage(file_id)
1899 webpage = urllib2.urlopen(request).read()
1900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1901 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1904 # Search for the real file URL
1905 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1906 if (mobj is None) or (mobj.group(1) is None):
1907 # Try to figure out reason of the error.
# The site embeds a human-readable restriction notice in a <strong> tag
# (e.g. download limits); surface it instead of a generic error.
1908 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1909 if (mobj is not None) and (mobj.group(1) is not None):
1910 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1911 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1913 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1916 file_url = mobj.group(1)
1917 file_extension = os.path.splitext(file_url)[1][1:]
1919 # Search for file title
1920 mobj = re.search(r'<b title="(.*?)">', webpage)
1922 self._downloader.trouble(u'ERROR: unable to extract title')
1924 file_title = mobj.group(1).decode('utf-8')
# Returned info dict (surrounding 'return [{' elided in this excerpt).
1927 'id': file_id.decode('utf-8'),
1928 'url': file_url.decode('utf-8'),
1930 'upload_date': u'NA',
1931 'title': file_title,
1932 'ext': file_extension.decode('utf-8'),
1938 class FacebookIE(InfoExtractor):
1939 """Information Extractor for Facebook"""
1941 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1942 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1943 _NETRC_MACHINE = 'facebook'
1944 _available_formats = ['video', 'highqual', 'lowqual']
1945 _video_extensions = {
1950 IE_NAME = u'facebook'
1952 def __init__(self, downloader=None):
1953 InfoExtractor.__init__(self, downloader)
1955 def _reporter(self, message):
1956 """Add header and report message."""
1957 self._downloader.to_screen(u'[facebook] %s' % message)
1959 def report_login(self):
1960 """Report attempt to log in."""
1961 self._reporter(u'Logging in')
1963 def report_video_webpage_download(self, video_id):
1964 """Report attempt to download video webpage."""
1965 self._reporter(u'%s: Downloading video webpage' % video_id)
1967 def report_information_extraction(self, video_id):
1968 """Report attempt to extract video information."""
1969 self._reporter(u'%s: Extracting video information' % video_id)
1971 def _parse_page(self, video_webpage):
1972 """Extract video information from page"""
1974 data = {'title': r'\("video_title", "(.*?)"\)',
1975 'description': r'<div class="datawrap">(.*?)</div>',
1976 'owner': r'\("video_owner_name", "(.*?)"\)',
1977 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1980 for piece in data.keys():
1981 mobj = re.search(data[piece], video_webpage)
1982 if mobj is not None:
1983 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1987 for fmt in self._available_formats:
1988 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1989 if mobj is not None:
1990 # URL is in a Javascript segment inside an escaped Unicode format within
1991 # the generally utf-8 page
1992 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1993 video_info['video_urls'] = video_urls
1997 def _real_initialize(self):
1998 if self._downloader is None:
2003 downloader_params = self._downloader.params
2005 # Attempt to use provided username and password or .netrc data
2006 if downloader_params.get('username', None) is not None:
2007 useremail = downloader_params['username']
2008 password = downloader_params['password']
2009 elif downloader_params.get('usenetrc', False):
2011 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2012 if info is not None:
2016 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2017 except (IOError, netrc.NetrcParseError), err:
2018 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2021 if useremail is None:
2030 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2033 login_results = urllib2.urlopen(request).read()
2034 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2035 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2037 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2038 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
    def _real_extract(self, url):
        """Fetch a Facebook watch page and report its metadata.

        Downloads video.php for the ID captured from *url*, parses it with
        _parse_page(), then builds one info dict per format selected by the
        downloader's 'format'/'format_limit' params.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # Missing owner/title are fatal; a missing thumbnail only warns.
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            # NOTE(review): this assignment presumably belongs to an elided
            # else-branch (thumbnail present) — confirm against full source.
            video_thumbnail = video_info['thumbnail']

        # Upload date: parse the page's RFC-2822 style timestamp if present.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Cap the candidate list at the requested quality limit.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # One info dict per selected (format, url) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL resolved to a direct media download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Resolve a blip.tv URL as either a direct media file or, for
        regular pages, via the site's skin=json API."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for the JSON description of this page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']

                # datestamp arrives like '05-14-11 08:30PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # Spoof an iTunes user agent for the actual media request —
        # presumably blip.tv serves a direct file to iTunes; confirm.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
2232 class MyVideoIE(InfoExtractor):
2233 """Information Extractor for myvideo.de."""
2235 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2236 IE_NAME = u'myvideo'
2238 def __init__(self, downloader=None):
2239 InfoExtractor.__init__(self, downloader)
2241 def report_download_webpage(self, video_id):
2242 """Report webpage download."""
2243 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2245 def report_extraction(self, video_id):
2246 """Report information extraction."""
2247 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2249 def _real_extract(self,url):
2250 mobj = re.match(self._VALID_URL, url)
2252 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2255 video_id = mobj.group(1)
2258 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2260 self.report_download_webpage(video_id)
2261 webpage = urllib2.urlopen(request).read()
2262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2266 self.report_extraction(video_id)
2267 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2270 self._downloader.trouble(u'ERROR: unable to extract media URL')
2272 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2274 mobj = re.search('<title>([^<]+)</title>', webpage)
2276 self._downloader.trouble(u'ERROR: unable to extract title')
2279 video_title = mobj.group(1)
2285 'upload_date': u'NA',
2286 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ":shortname" alias or a full-episodes page URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a media configuration document."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the show's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show/episode URL and extract every media item it
        contains, appending one info dict per item to results."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname aliases are rewritten to the show's newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None  # rewritten URL always matches the pattern

        # Empty episode group means "download the newest episode".
        dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # The newest-episode request redirects; re-match against the final
        # URL to learn the concrete episode slug.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The page embeds mtvnservices Flash URLs that carry the media URI.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()  # follow redirects to the real SWF
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # Walk every <item> in the MRSS index; each is one media segment.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like '...:<show>.com:<mediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            # Collect (bitrate, url) pairs from the mediaGen config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Pull the media URL out of the page's player configuration."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = urllib2.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset announced in Content-Type (utf-8
            # when the header names none).
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Scrape metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # NOTE(review): index 1 assumed to be the main video entry — confirm.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Map the public video id to the internal one, then read the
        moogaloop metadata XML describing the media file."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal id ("video:NNN") used by the XML API.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # Extension taken from the tail of the file URL.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Video URL is percent-encoded in a flv_url page parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title comes from the <title> tag, trailing "- XVID..." stripped.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is the entire matched URL, not just the basename capture.
        video_thumbnail = mobj.group(0).decode('utf-8')

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the tokenised stream URL and metadata for a track page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
            title = mobj.group(1).decode('utf-8')
            # fall back to the slug-derived title when the page has none
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description (optional on the page)
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
            description = mobj.group(1)

        # upload date: scraped from the human-readable "pretty date"
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
            except Exception, e:
                # date parsing is best-effort; failures only warn
                self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 media reference embedded in an InfoQ page
        into an rtmpe stream URL plus title/description metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The page stores a base64-encoded rtmp path in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # NOTE(review): breaks if the filename contains zero or several dots.
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        The entry for *fmt* is either a {bitrate: urls} map or a plain
        url list (no bitrate info); 'best' picks the highest bitrate.
        """
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe by opening; unreachable URLs are simply skipped.
                urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        """Print a 'format<TAB>bitrate<TAB>[ext]' listing to stdout."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Pick an audio format/bitrate from the cloudcast JSON and hand
        its first working URL to the downloader."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
            self.report_download_json(file_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse the API response
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try each format until one yields a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a CoursePage, or a VideoPage with optional
    # course/video query parameters.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, one course, or the whole root index.

        Course and root pages recurse via self.extract() into every page
        they reference and concatenate the results.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,

            self.report_extraction(info['id'])
            # Metadata lives in a per-video XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')

            self.report_download_webpage(info['id'])
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            # Course title from the page heading, id as fallback.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # One reference entry per linked video page, in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

            # Root page: enumerate every course linked from the home page.
                'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
3025 class MTVIE(InfoExtractor):
3026 """Information extractor for MTV.com"""
3028 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3031 def report_webpage(self, video_id):
3032 """Report information extraction."""
3033 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3035 def report_extraction(self, video_id):
3036 """Report information extraction."""
3037 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3039 def _real_extract(self, url):
3040 mobj = re.match(self._VALID_URL, url)
3042 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3044 if not mobj.group('proto'):
3045 url = 'http://' + url
3046 video_id = mobj.group('videoid')
3047 self.report_webpage(video_id)
3049 request = urllib2.Request(url)
3051 webpage = urllib2.urlopen(request).read()
3052 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3053 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3056 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3058 self._downloader.trouble(u'ERROR: unable to extract song name')
3060 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3061 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3063 self._downloader.trouble(u'ERROR: unable to extract performer')
3065 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066 video_title = performer + ' - ' + song_name
3068 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3070 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3072 mtvn_uri = mobj.group(1)
3074 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3076 self._downloader.trouble(u'ERROR: unable to extract content id')
3078 content_id = mobj.group(1)
3080 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3081 self.report_extraction(video_id)
3082 request = urllib2.Request(videogen_url)
3084 metadataXml = urllib2.urlopen(request).read()
3085 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3086 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3089 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3090 renditions = mdoc.findall('.//rendition')
3092 # For now, always pick the highest quality.
3093 rendition = renditions[-1]
3096 _,_,ext = rendition.attrib['type'].partition('/')
3097 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3098 video_url = rendition.find('./src').text
3100 self._downloader.trouble('Invalid rendition field.')
3106 'uploader': performer,
3107 'title': video_title,