2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	upload_date:	Video upload date (YYYYMMDD), or u'NA' if unknown.
	title:		Video title.
	ext:		Video filename extension.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	# _ready tracks whether _real_initialize() has already run, so that
	# per-instance setup (logins, cookies, ...) happens at most once.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Lazily run the subclass-specific setup exactly once.
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# NOTE(review): this copy of the file is mangled -- many interleaved
	# lines (try:, return, dict entries and closing braces) are missing.
	# Code tokens below are as found; indentation is best-effort.

	# Group 2 of this regexp captures the 11-char video id; matches
	# youtu.be short links, watch/embed/v URLs and hash-bang style URLs.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# NOTE(review): both dict literals below are truncated in this copy
	# (entries and closing braces missing) -- restore from upstream.
	_video_extensions = {
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	_video_dimensions = {

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text.

		NOTE(review): the 'srt' accumulator initialization, the
		float() conversion of 'start', and the final 'return srt' are
		missing from this copy -- as-is, 'start + float(dur)' would
		concatenate a str with a float.
		"""
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			# Captions without an explicit duration default to 4 seconds.
			if not dur: dur = '4'
			end = start + float(dur)
			# Format both timestamps as HH:MM:SS,mmm per the SRT spec.
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'

	def _print_formats(self, formats):
		"""Print itag, extension and dimensions for each available format.

		NOTE(review): the 'for x in formats:' loop header is missing
		from this copy.
		"""
		print 'Available formats:'
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set language to English, then log in and confirm age if
		credentials were supplied (via params or .netrc).

		NOTE(review): several try:/return lines and the login_form /
		age_form dict literals are incomplete in this copy.
		"""
		if self._downloader is None:

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Force the site into English so date/format parsing below works.
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed

			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# The login form being present in the response means the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Extract id, formats, metadata and optional subtitles for a
		YouTube URL and hand the result dict(s) to the downloader.

		NOTE(review): many guard lines ('if mobj is None:', 'try:',
		'return') are missing from this copy; comments describe the
		apparent intent of what remains.
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
			# Un-escape the JSON-escaped URL (\\/ -> /).
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

		# Try several 'el' variants; stop at the first response with a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Check for "rental" videos
		if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
			self._downloader.trouble(u'ERROR: "rental" videos not supported')

		# Start extracting information
		self.report_information_extraction(video_id)

		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# Upload date: scraped from the watch page, normalized to YYYYMMDD.
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# Closed captions: failures here are non-fatal (Trouble is caught below).
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				# Map lang_code -> track name for every available track.
				srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
				srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Preference: explicit --subtitleslang, then English, then first track.
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = srt_lang_list.keys()[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
					raise Trouble(u'WARNING: unable to download video subtitles')
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				self._downloader.trouble(trouble[0])

		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# The stream map is a comma-separated list of querystrings,
			# one per format; build itag -> signed URL.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				# Cap quality at format_limit (lists are best-first).
				format_list = available_formats[available_formats.index(format_limit):]
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
						video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

		# Emit one result dict per selected format.
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

				'id':		video_id.decode('utf-8'),
				'url':		video_real_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	upload_date,
				'title':	video_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description,
				'player_url':	player_url,
				'subtitles':	video_subtitles
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# NOTE(review): this copy of the file is mangled -- several guard,
	# try:/return and dict-literal lines are missing; code tokens are as
	# found, indentation is best-effort.

	# Group 1 is the video id (possibly 'yt-<id>' for YouTube-hosted clips).
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the
		age-confirmation form to disable filtering for the session.

		NOTE(review): the try: lines and the disclaimer_form dict
		literal are incomplete in this copy.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader for a Metacafe page.

		NOTE(review): 'if mobj is None:'/'try:'/'return' lines are
		missing throughout this copy.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate YouTube-hosted clips to the YouTube extractor.
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			# Fall back to the flashvars blob when &mediaURL= is absent.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			# Un-escape JSON slashes in the media URL.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# NOTE(review): this copy of the file is mangled -- 'if mobj is
	# None:'/'try:'/'return' lines are missing; code tokens are as found.

	# Group 1 is the video id (the part before the first '_' in the slug).
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the HQ media URL, title and uploader from a
		Dailymotion video page."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted pages still render.
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		flashvars = urllib.unquote(mobj.group(1))

		mobj = re.search(r'"hqURL":"(.+?)"', flashvars)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		# Un-escape JSON slashes in the media URL.
		hqURL = mobj.group(1).replace('\\/', '/')

		# TODO: support ldurl and sdurl qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			'id':		video_id.decode('utf-8'),
			'url':		hqURL.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# NOTE(review): this copy of the file is mangled -- 'if mobj is
	# None:'/'try:'/'return' lines and the video_url assignment are
	# missing; code tokens are as found.

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract media URL, title, description and (optionally)
		thumbnail from a Google Video page."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct mp4 download_url; fall back to the flv videoUrl.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# Decode the \xNN escapes embedded in the page's JavaScript.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		mobj = re.search(r'<title>(.*)</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail only appears on the search results page.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info

			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# NOTE(review): this copy of the file is mangled -- 'if mobj is
	# None:'/'try:'/'return' lines and the 'video_url = mediaURL'
	# assignment are missing; code tokens are as found.

	# Group 1 is the .flv filename from the 'current=' query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a Photobucket page."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))

		# The <title> carries both title and uploader: "<title> video by <user>".
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')

		video_uploader = mobj.group(2).decode('utf-8')

			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# NOTE(review): this copy of the file is mangled -- 'if mobj is
	# None:'/'try:'/'return' lines are missing; code tokens are as found.

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract metadata and the playlist media URL for a Yahoo! video.

		Non-/watch/ URLs are first resolved to a canonical /watch/ URL
		and re-extracted once (new_video=False guards the recursion).
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = urllib2.Request(url)
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
				self._downloader.trouble(u'ERROR: Unable to extract id field')
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
				self._downloader.trouble(u'ERROR: Unable to extract vid field')
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
		# NOTE(review): group(1) here is the 'people|profile' alternative,
		# not the link text in group(2) -- looks suspect, verify upstream.
		video_uploader = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video height')
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video width')
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
		yv_bitrate = '700'  # according to Wikipedia this is hard-coded
		request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
			self._downloader.trouble(u'ERROR: Unable to extract media URL')
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = unescapeHTML(video_url)

			'id':		video_id.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail.decode('utf-8'),
			'description':	video_description,
			# NOTE(review): duplicate 'thumbnail' key -- this entry
			# silently overrides the .decode('utf-8') one above;
			# one of the two should be removed.
			'thumbnail':	video_thumbnail,
977 class VimeoIE(InfoExtractor):
# NOTE(review): this capture is a line-numbered, elided paste — guard lines
# ("if mobj is None:", "try:", "return", ...) are missing between some
# statements below; comments describe only what the visible code shows.
978 """Information extractor for vimeo.com."""
980 # _VALID_URL matches Vimeo URLs
981 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
984 def __init__(self, downloader=None):
985 InfoExtractor.__init__(self, downloader)
987 def report_download_webpage(self, video_id):
988 """Report webpage download."""
989 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
991 def report_extraction(self, video_id):
992 """Report information extraction."""
993 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
995 def _real_extract(self, url, new_video=True):
996 # Extract ID from URL
997 mobj = re.match(self._VALID_URL, url)
999 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1002 video_id = mobj.group(1)
1004 # Retrieve video webpage to extract further information
1005 request = urllib2.Request(url, None, std_headers)
1007 self.report_download_webpage(video_id)
1008 webpage = urllib2.urlopen(request).read()
1009 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1013 # Now we begin extracting as much information as we can from what we
1014 # retrieved. First we extract the information common to all extractors,
1015 # and latter we extract those that are Vimeo specific.
1016 self.report_extraction(video_id)
1018 # Extract the config JSON
# The player config JSON is embedded inline in the page between the markers
# " = {config:" and ",assets:"; string-splitting like this is brittle if
# Vimeo changes its page markup.
1019 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1021 config = json.loads(config)
1023 self._downloader.trouble(u'ERROR: unable to extract info section')
1027 video_title = config["video"]["title"]
1030 video_uploader = config["video"]["owner"]["name"]
1032 # Extract video thumbnail
1033 video_thumbnail = config["video"]["thumbnail"]
1035 # Extract video description
# Description is scraped from the HTML (id="description"), not from the JSON.
1036 video_description = get_element_by_id("description", webpage.decode('utf8'))
1037 if video_description: video_description = clean_html(video_description)
1038 else: video_description = ''
1040 # Extract upload date
1041 video_upload_date = u'NA'
1042 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1043 if mobj is not None:
1044 video_upload_date = mobj.group(1)
1046 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL below.
1047 sig = config['request']['signature']
1048 timestamp = config['request']['timestamp']
1050 # Vimeo specific: extract video codec and quality information
1051 # TODO bind to format param
# Preference order: h264 (mp4 container) first, then vp8 / vp6 (flv).
1052 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053 for codec in codecs:
1054 if codec[0] in config["video"]["files"]:
1055 video_codec = codec[0]
1056 video_extension = codec[1]
1057 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1058 else: quality = 'sd'
1061 self._downloader.trouble(u'ERROR: no known codec found')
# Build the final media URL from id + signature + timestamp + chosen quality.
1064 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1065 %(video_id, sig, timestamp, quality, video_codec.upper())
# Returned info dict (leading 'id'/'url' keys are not in the visible capture).
1070 'uploader': video_uploader,
1071 'upload_date': video_upload_date,
1072 'title': video_title,
1073 'ext': video_extension,
1074 'thumbnail': video_thumbnail,
1075 'description': video_description,
1080 class GenericIE(InfoExtractor):
# NOTE(review): elided paste — several control-flow lines are missing from
# the capture; comments describe only the visible statements.
1081 """Generic last-resort information extractor."""
1084 IE_NAME = u'generic'
1086 def __init__(self, downloader=None):
1087 InfoExtractor.__init__(self, downloader)
1089 def report_download_webpage(self, video_id):
1090 """Report webpage download."""
# Warns loudly: the generic extractor only runs when no specific IE matched.
1091 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1092 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1094 def report_extraction(self, video_id):
1095 """Report information extraction."""
1096 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1098 def report_following_redirect(self, new_url):
1099 """Report information extraction."""
1100 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1102 def _test_redirect(self, url):
1103 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Probe with HTTP HEAD so we never download the body just to learn the
# final URL behind a shortener.
1104 class HeadRequest(urllib2.Request):
1105 def get_method(self):
# Keep using HEAD on each hop of a redirect chain (urllib2 would
# otherwise re-issue a GET after the first redirect).
1108 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1110 Subclass the HTTPRedirectHandler to make it use our
1111 HeadRequest also on the redirected URL
1113 def redirect_request(self, req, fp, code, msg, headers, newurl):
1114 if code in (301, 302, 303, 307):
1115 newurl = newurl.replace(' ', '%20')
1116 newheaders = dict((k,v) for k,v in req.headers.items()
1117 if k.lower() not in ("content-length", "content-type"))
1118 return HeadRequest(newurl,
1120 origin_req_host=req.get_origin_req_host(),
1123 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
# Some servers reject HEAD outright; retry the same URL with a GET.
1125 class HTTPMethodFallback(urllib2.BaseHandler):
1127 Fallback to GET if HEAD is not allowed (405 HTTP error)
1129 def http_error_405(self, req, fp, code, msg, headers):
1133 newheaders = dict((k,v) for k,v in req.headers.items()
1134 if k.lower() not in ("content-length", "content-type"))
1135 return self.parent.open(urllib2.Request(req.get_full_url(),
1137 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers we need (no cookies etc.).
1141 opener = urllib2.OpenerDirector()
1142 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1143 HTTPMethodFallback, HEADRedirectHandler,
1144 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1145 opener.add_handler(handler())
1147 response = opener.open(HeadRequest(url))
1148 new_url = response.geturl()
# Same URL back means no redirect: let the normal extraction proceed.
1150 if url == new_url: return False
# Redirected: restart the whole extractor chain on the resolved URL.
1152 self.report_following_redirect(new_url)
1153 self._downloader.download([new_url])
1156 def _real_extract(self, url):
1157 if self._test_redirect(url): return
1159 video_id = url.split('/')[-1]
1160 request = urllib2.Request(url)
1162 self.report_download_webpage(video_id)
1163 webpage = urllib2.urlopen(request).read()
1164 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1165 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1167 except ValueError, err:
1168 # since this is the last-resort InfoExtractor, if
1169 # this error is thrown, it'll be thrown here
1170 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1173 self.report_extraction(video_id)
1174 # Start with something easy: JW Player in SWFObject
1175 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1177 # Broaden the search a little bit
1178 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1180 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183 # It's possible that one of the regexes
1184 # matched, but returned an empty group:
1185 if mobj.group(1) is None:
1186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1189 video_url = urllib.unquote(mobj.group(1))
1190 video_id = os.path.basename(video_url)
1192 # here's a fun little line of code for you:
# Strip the extension off the basename: "clip.flv" -> ext "flv", id "clip".
1193 video_extension = os.path.splitext(video_id)[1][1:]
1194 video_id = os.path.splitext(video_id)[0]
1196 # it's tempting to parse this further, but you would
1197 # have to take into account all the variations like
1198 # Video Title - Site Name
1199 # Site Name | Video Title
1200 # Video Title - Tagline | Site Name
1201 # and so on and so forth; it's just not practical
1202 mobj = re.search(r'<title>(.*)</title>', webpage)
1204 self._downloader.trouble(u'ERROR: unable to extract title')
1206 video_title = mobj.group(1).decode('utf-8')
1208 # video uploader is domain name
1209 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error text says "title" but this failure is about the
# uploader/domain — misleading message (code kept byte-identical here).
1211 self._downloader.trouble(u'ERROR: unable to extract title')
1213 video_uploader = mobj.group(1).decode('utf-8')
1216 'id': video_id.decode('utf-8'),
1217 'url': video_url.decode('utf-8'),
1218 'uploader': video_uploader,
1219 'upload_date': u'NA',
1220 'title': video_title,
1221 'ext': video_extension.decode('utf-8'),
1227 class YoutubeSearchIE(InfoExtractor):
# NOTE(review): elided paste — some guard/"try:"/"return" lines are missing
# from the capture; comments describe only the visible statements.
1228 """Information Extractor for YouTube search queries."""
# Queries look like "ytsearch:foo", "ytsearch25:foo" or "ytsearchall:foo".
1229 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData API v2, JSON-C output; start-index is filled per page below.
1230 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1231 _max_youtube_results = 1000
1232 IE_NAME = u'youtube:search'
1234 def __init__(self, downloader=None):
1235 InfoExtractor.__init__(self, downloader)
1237 def report_download_page(self, query, pagenum):
1238 """Report attempt to download search page with given number."""
1239 query = query.decode(preferredencoding())
1240 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1242 def _real_extract(self, query):
1243 mobj = re.match(self._VALID_URL, query)
1245 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the actual query text.
1248 prefix, query = query.split(':')
1250 query = query.encode('utf-8')
# Bare "ytsearch:" downloads just the first result.
1252 self._download_n_results(query, 1)
1254 elif prefix == 'all':
1255 self._download_n_results(query, self._max_youtube_results)
1261 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1263 elif n > self._max_youtube_results:
# Clamp oversized requests to the API's practical maximum.
1264 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1265 n = self._max_youtube_results
1266 self._download_n_results(query, n)
1268 except ValueError: # parsing prefix as integer fails
1269 self._download_n_results(query, 1)
1272 def _download_n_results(self, query, n):
1273 """Downloads a specified number of results for a query"""
# Page through the API in chunks of 50 (its per-request maximum).
1279 while (50 * pagenum) < limit:
1280 self.report_download_page(query, pagenum+1)
1281 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1282 request = urllib2.Request(result_url)
1284 data = urllib2.urlopen(request).read()
1285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1286 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1288 api_response = json.loads(data)['data']
1290 new_ids = list(video['id'] for video in api_response['items'])
1291 video_ids += new_ids
# Never ask for more than the API reports as available.
1293 limit = min(n, api_response['totalItems'])
1296 if len(video_ids) > n:
1297 video_ids = video_ids[:n]
1298 for id in video_ids:
1299 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1303 class GoogleSearchIE(InfoExtractor):
# NOTE(review): elided paste — guard/"try:"/"return" lines missing between
# some statements; comments describe only the visible code.
1304 """Information Extractor for Google Video search queries."""
# Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
1305 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1306 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Scrapes result-page HTML for videoplay docids (no API).
1307 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next" pager link means more result pages exist.
1308 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1309 _max_google_results = 1000
1310 IE_NAME = u'video.google:search'
1312 def __init__(self, downloader=None):
1313 InfoExtractor.__init__(self, downloader)
1315 def report_download_page(self, query, pagenum):
1316 """Report attempt to download playlist page with given number."""
1317 query = query.decode(preferredencoding())
1318 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1320 def _real_extract(self, query):
1321 mobj = re.match(self._VALID_URL, query)
1323 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1326 prefix, query = query.split(':')
1328 query = query.encode('utf-8')
1330 self._download_n_results(query, 1)
1332 elif prefix == 'all':
1333 self._download_n_results(query, self._max_google_results)
1339 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1341 elif n > self._max_google_results:
1342 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1343 n = self._max_google_results
1344 self._download_n_results(query, n)
1346 except ValueError: # parsing prefix as integer fails
1347 self._download_n_results(query, 1)
1350 def _download_n_results(self, query, n):
1351 """Downloads a specified number of results for a query"""
1357 self.report_download_page(query, pagenum)
# start parameter advances 10 results per page.
1358 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1359 request = urllib2.Request(result_url)
1361 page = urllib2.urlopen(request).read()
1362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1366 # Extract video identifiers
1367 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1368 video_id = mobj.group(1)
1369 if video_id not in video_ids:
1370 video_ids.append(video_id)
# Enough ids collected: hand everything to the downloader and stop.
1371 if len(video_ids) == n:
1372 # Specified n videos reached
1373 for id in video_ids:
1374 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link: download what we found so far and stop.
1377 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1378 for id in video_ids:
1379 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1382 pagenum = pagenum + 1
1385 class YahooSearchIE(InfoExtractor):
# NOTE(review): elided paste — some structural lines missing from the
# capture; comments describe only the visible code. Structure mirrors
# GoogleSearchIE, plus an `already_seen` set for de-duplication.
1386 """Information Extractor for Yahoo! Video search queries."""
# Queries look like "yvsearch:foo", "yvsearchN:foo" or "yvsearchall:foo".
1387 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1388 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1389 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1390 _MORE_PAGES_INDICATOR = r'\s*Next'
1391 _max_yahoo_results = 1000
1392 IE_NAME = u'video.yahoo:search'
1394 def __init__(self, downloader=None):
1395 InfoExtractor.__init__(self, downloader)
1397 def report_download_page(self, query, pagenum):
1398 """Report attempt to download playlist page with given number."""
1399 query = query.decode(preferredencoding())
1400 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1402 def _real_extract(self, query):
1403 mobj = re.match(self._VALID_URL, query)
1405 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1408 prefix, query = query.split(':')
1410 query = query.encode('utf-8')
1412 self._download_n_results(query, 1)
1414 elif prefix == 'all':
1415 self._download_n_results(query, self._max_yahoo_results)
1421 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1423 elif n > self._max_yahoo_results:
1424 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1425 n = self._max_yahoo_results
1426 self._download_n_results(query, n)
1428 except ValueError: # parsing prefix as integer fails
1429 self._download_n_results(query, 1)
1432 def _download_n_results(self, query, n):
1433 """Downloads a specified number of results for a query"""
1436 already_seen = set()
1440 self.report_download_page(query, pagenum)
1441 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1442 request = urllib2.Request(result_url)
1444 page = urllib2.urlopen(request).read()
1445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1446 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1449 # Extract video identifiers
1450 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1451 video_id = mobj.group(1)
# Set membership test keeps the list ordered while de-duplicating.
1452 if video_id not in already_seen:
1453 video_ids.append(video_id)
1454 already_seen.add(video_id)
1455 if len(video_ids) == n:
1456 # Specified n videos reached
1457 for id in video_ids:
1458 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link: stop paging and download what was collected.
1461 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1462 for id in video_ids:
1463 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1466 pagenum = pagenum + 1
1469 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): elided paste — some guard/"break"/"else:" lines are missing
# from the capture; comments describe only the visible code.
1470 """Information Extractor for YouTube playlists."""
# Group 1 = page type char ('p'/'a'/'list'), group 2 = playlist id,
# group 3 (optional) = a single video id embedded in the URL.
1472 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1473 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1474 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s'
1475 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1476 IE_NAME = u'youtube:playlist'
1478 def __init__(self, downloader=None):
1479 InfoExtractor.__init__(self, downloader)
1481 def report_download_page(self, playlist_id, pagenum):
1482 """Report attempt to download playlist page with given number."""
1483 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1485 def _real_extract(self, url):
1486 # Extract playlist id
1487 mobj = re.match(self._VALID_URL, url)
1489 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video shortcut: if the URL carries a video id, download just it.
1493 if mobj.group(3) is not None:
1494 self._downloader.download([mobj.group(3)])
1497 # Download playlist pages
1498 # prefix is 'p' as default for playlists but there are other types that need extra care
1499 playlist_prefix = mobj.group(1)
1500 if playlist_prefix == 'a':
1501 playlist_access = 'artist'
1503 playlist_prefix = 'p'
1504 playlist_access = 'view_play_list'
1505 playlist_id = mobj.group(2)
1510 self.report_download_page(playlist_id, pagenum)
1511 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1512 request = urllib2.Request(url)
1514 page = urllib2.urlopen(request).read()
1515 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1516 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1519 # Extract video identifiers
1521 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1522 if mobj.group(1) not in ids_in_page:
1523 ids_in_page.append(mobj.group(1))
1524 video_ids.extend(ids_in_page)
# Stop paging when the "next" pager control disappears.
1526 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1528 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end slicing (1-based options).
1530 playliststart = self._downloader.params.get('playliststart', 1) - 1
1531 playlistend = self._downloader.params.get('playlistend', -1)
1532 if playlistend == -1:
1533 video_ids = video_ids[playliststart:]
1535 video_ids = video_ids[playliststart:playlistend]
1537 for id in video_ids:
1538 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1542 class YoutubeUserIE(InfoExtractor):
# NOTE(review): elided paste — some "try:"/"break"/"else:" lines missing
# from the capture; comments describe only the visible code.
1543 """Information Extractor for YouTube users."""
1545 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1546 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request at 50; we page with start-index below.
1547 _GDATA_PAGE_SIZE = 50
1548 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1549 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1550 IE_NAME = u'youtube:user'
1552 def __init__(self, downloader=None):
1553 InfoExtractor.__init__(self, downloader)
1555 def report_download_page(self, username, start_index):
1556 """Report attempt to download user page."""
1557 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1558 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1560 def _real_extract(self, url):
1562 mobj = re.match(self._VALID_URL, url)
1564 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1567 username = mobj.group(1)
1569 # Download video ids using YouTube Data API. Result size per
1570 # query is limited (currently to 50 videos) so we need to query
1571 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1578 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1579 self.report_download_page(username, start_index)
1581 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1584 page = urllib2.urlopen(request).read()
1585 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1586 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1589 # Extract video identifiers
1592 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1593 if mobj.group(1) not in ids_in_page:
1594 ids_in_page.append(mobj.group(1))
1596 video_ids.extend(ids_in_page)
1598 # A little optimization - if current page is not
1599 # "full", ie. does not contain PAGE_SIZE video ids then
1600 # we can assume that this page is the last one - there
1601 # are no more ids on further pages - no need to query
1604 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start/--playlist-end slicing (1-based options).
1609 all_ids_count = len(video_ids)
1610 playliststart = self._downloader.params.get('playliststart', 1) - 1
1611 playlistend = self._downloader.params.get('playlistend', -1)
1613 if playlistend == -1:
1614 video_ids = video_ids[playliststart:]
1616 video_ids = video_ids[playliststart:playlistend]
1618 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1619 (username, all_ids_count, len(video_ids)))
1621 for video_id in video_ids:
1622 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1625 class BlipTVUserIE(InfoExtractor):
# NOTE(review): elided paste — some structural lines missing from the
# capture; comments describe only the visible code. Mirrors YoutubeUserIE's
# paging strategy, but against blip.tv's mobile AJAX episode list.
1626 """Information Extractor for blip.tv users."""
1628 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1630 IE_NAME = u'blip.tv:user'
1632 def __init__(self, downloader=None):
1633 InfoExtractor.__init__(self, downloader)
1635 def report_download_page(self, username, pagenum):
1636 """Report attempt to download user page."""
1637 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1638 (self.IE_NAME, username, pagenum))
1640 def _real_extract(self, url):
1642 mobj = re.match(self._VALID_URL, url)
1644 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1647 username = mobj.group(1)
# Template for the mobile-site episode-list AJAX endpoint; users_id is
# filled in after scraping the user page below.
1649 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1651 request = urllib2.Request(url)
1654 page = urllib2.urlopen(request).read().decode('utf-8')
# The numeric user id (needed by the AJAX endpoint) is scraped from the
# data-users-id attribute of the user's page.
1655 mobj = re.search(r'data-users-id="([^"]+)"', page)
1656 page_base = page_base % mobj.group(1)
1657 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1658 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1662 # Download video ids using BlipTV Ajax calls. Result size per
1663 # query is limited (currently to 12 videos) so we need to query
1664 # page by page until there are no video ids - it means we got
1671 self.report_download_page(username, pagenum)
1673 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1676 page = urllib2.urlopen(request).read().decode('utf-8')
1677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1678 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1681 # Extract video identifiers
1684 for mobj in re.finditer(r'href="/([^"]+)"', page):
1685 if mobj.group(1) not in ids_in_page:
1686 ids_in_page.append(unescapeHTML(mobj.group(1)))
1688 video_ids.extend(ids_in_page)
1690 # A little optimization - if current page is not
1691 # "full", ie. does not contain PAGE_SIZE video ids then
1692 # we can assume that this page is the last one - there
1693 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is referenced here but not defined in the
# visible lines of this class — presumably declared in an elided line
# (the comment above says 12); verify against the full file.
1696 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start/--playlist-end slicing (1-based options).
1701 all_ids_count = len(video_ids)
1702 playliststart = self._downloader.params.get('playliststart', 1) - 1
1703 playlistend = self._downloader.params.get('playlistend', -1)
1705 if playlistend == -1:
1706 video_ids = video_ids[playliststart:]
1708 video_ids = video_ids[playliststart:playlistend]
1710 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1711 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1713 for video_id in video_ids:
1714 self._downloader.download([u'http://blip.tv/'+video_id])
1717 class DepositFilesIE(InfoExtractor):
# NOTE(review): elided paste — some "try:"/"return" lines missing from the
# capture; comments describe only the visible code.
1718 """Information extractor for depositfiles.com"""
1720 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1721 IE_NAME = u'DepositFiles'
1723 def __init__(self, downloader=None):
1724 InfoExtractor.__init__(self, downloader)
1726 def report_download_webpage(self, file_id):
1727 """Report webpage download."""
1728 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1730 def report_extraction(self, file_id):
1731 """Report information extraction."""
1732 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1734 def _real_extract(self, url):
1735 file_id = url.split('/')[-1]
1736 # Rebuild url in english locale
# Normalizes any locale-prefixed URL (e.g. /de/files/...) to /en/files/...
# so the page markup scraped below is predictable.
1737 url = 'http://depositfiles.com/en/files/' + file_id
1739 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the "Free download" button.
1740 free_download_indication = { 'gateway_result' : '1' }
1741 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1743 self.report_download_webpage(file_id)
1744 webpage = urllib2.urlopen(request).read()
1745 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1746 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1749 # Search for the real file URL
1750 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1751 if (mobj is None) or (mobj.group(1) is None):
1752 # Try to figure out reason of the error.
# The site shows restriction notices (download limits etc.) in a
# <strong>Attention...</strong> block; surface that text to the user.
1753 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1754 if (mobj is not None) and (mobj.group(1) is not None):
1755 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1756 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1758 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1761 file_url = mobj.group(1)
1762 file_extension = os.path.splitext(file_url)[1][1:]
1764 # Search for file title
1765 mobj = re.search(r'<b title="(.*?)">', webpage)
1767 self._downloader.trouble(u'ERROR: unable to extract title')
1769 file_title = mobj.group(1).decode('utf-8')
# Returned info dict (some keys of the literal are elided from this capture).
1772 'id': file_id.decode('utf-8'),
1773 'url': file_url.decode('utf-8'),
1775 'upload_date': u'NA',
1776 'title': file_title,
1777 'ext': file_extension.decode('utf-8'),
1783 class FacebookIE(InfoExtractor):
# NOTE(review): elided paste — many guard/"try:"/"return"/"else:" lines are
# missing from the capture; comments describe only the visible code.
1784 """Information Extractor for Facebook"""
1786 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1787 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1788 _NETRC_MACHINE = 'facebook'
# Ordered best-quality-first; format selection below relies on this order.
1789 _available_formats = ['video', 'highqual', 'lowqual']
1790 _video_extensions = {
1795 IE_NAME = u'facebook'
1797 def __init__(self, downloader=None):
1798 InfoExtractor.__init__(self, downloader)
1800 def _reporter(self, message):
1801 """Add header and report message."""
1802 self._downloader.to_screen(u'[facebook] %s' % message)
1804 def report_login(self):
1805 """Report attempt to log in."""
1806 self._reporter(u'Logging in')
1808 def report_video_webpage_download(self, video_id):
1809 """Report attempt to download video webpage."""
1810 self._reporter(u'%s: Downloading video webpage' % video_id)
1812 def report_information_extraction(self, video_id):
1813 """Report attempt to extract video information."""
1814 self._reporter(u'%s: Extracting video information' % video_id)
1816 def _parse_page(self, video_webpage):
1817 """Extract video information from page"""
# Values live inside JavaScript calls in the page; each regex captures
# one field.
1819 data = {'title': r'\("video_title", "(.*?)"\)',
1820 'description': r'<div class="datawrap">(.*?)</div>',
1821 'owner': r'\("video_owner_name", "(.*?)"\)',
1822 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1825 for piece in data.keys():
1826 mobj = re.search(data[piece], video_webpage)
1827 if mobj is not None:
1828 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format name ("video", "highqual", ...).
1832 for fmt in self._available_formats:
1833 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1834 if mobj is not None:
1835 # URL is in a Javascript segment inside an escaped Unicode format within
1836 # the generally utf-8 page
1837 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1838 video_info['video_urls'] = video_urls
1842 def _real_initialize(self):
1843 if self._downloader is None:
1848 downloader_params = self._downloader.params
1850 # Attempt to use provided username and password or .netrc data
1851 if downloader_params.get('username', None) is not None:
1852 useremail = downloader_params['username']
1853 password = downloader_params['password']
1854 elif downloader_params.get('usenetrc', False):
1856 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1857 if info is not None:
1861 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1862 except (IOError, netrc.NetrcParseError), err:
1863 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login (extraction of public videos
# presumably still proceeds — confirm against the full file).
1866 if useremail is None:
1875 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1878 login_results = urllib2.urlopen(request).read()
# A login <form> in the response means we are still on the login page,
# i.e. authentication failed.
1879 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1880 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1882 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1886 def _real_extract(self, url):
1887 mobj = re.match(self._VALID_URL, url)
1889 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1891 video_id = mobj.group('ID')
1894 self.report_video_webpage_download(video_id)
1895 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1897 page = urllib2.urlopen(request)
1898 video_webpage = page.read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1903 # Start extracting information
1904 self.report_information_extraction(video_id)
1906 # Extract information
1907 video_info = self._parse_page(video_webpage)
1910 if 'owner' not in video_info:
1911 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1913 video_uploader = video_info['owner']
1916 if 'title' not in video_info:
1917 self._downloader.trouble(u'ERROR: unable to extract video title')
1919 video_title = video_info['title']
1920 video_title = video_title.decode('utf-8')
# Thumbnail is optional: warn and fall back to an empty string.
1923 if 'thumbnail' not in video_info:
1924 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1925 video_thumbnail = ''
1927 video_thumbnail = video_info['thumbnail']
# Upload date: parse the page's RFC-2822-style date into YYYYMMDD.
1931 if 'upload_date' in video_info:
1932 upload_time = video_info['upload_date']
1933 timetuple = email.utils.parsedate_tz(upload_time)
1934 if timetuple is not None:
1936 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1941 video_description = video_info.get('description', 'No description available.')
1943 url_map = video_info['video_urls']
1944 if len(url_map.keys()) > 0:
1945 # Decide which formats to download
# Same format-selection logic as the YouTube extractor: apply the
# -f/--format and --max-quality options against _available_formats.
1946 req_format = self._downloader.params.get('format', None)
1947 format_limit = self._downloader.params.get('format_limit', None)
1949 if format_limit is not None and format_limit in self._available_formats:
1950 format_list = self._available_formats[self._available_formats.index(format_limit):]
1952 format_list = self._available_formats
1953 existing_formats = [x for x in format_list if x in url_map]
1954 if len(existing_formats) == 0:
1955 self._downloader.trouble(u'ERROR: no known formats available for video')
1957 if req_format is None:
# List is ordered best-first, so element 0 is the best quality.
1958 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1959 elif req_format == 'worst':
1960 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1961 elif req_format == '-1':
1962 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1965 if req_format not in url_map:
1966 self._downloader.trouble(u'ERROR: requested format not available')
1968 video_url_list = [(req_format, url_map[req_format])] # Specific format
1971 for format_param, video_real_url in video_url_list:
1973 video_extension = self._video_extensions.get(format_param, 'mp4')
1976 'id': video_id.decode('utf-8'),
1977 'url': video_real_url.decode('utf-8'),
1978 'uploader': video_uploader.decode('utf-8'),
1979 'upload_date': upload_date,
1980 'title': video_title,
1981 'ext': video_extension.decode('utf-8'),
1982 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1983 'thumbnail': video_thumbnail.decode('utf-8'),
1984 'description': video_description.decode('utf-8'),
1989 class BlipTVIE(InfoExtractor):
1990 """Information extractor for blip.tv"""
# NOTE(review): gaps in the embedded numbering show elided lines (the
# 'if mobj is None:' guards, several 'try:' headers, the direct-download
# info dict, returns); confirm control flow against the complete file.
1992 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1993 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1994 IE_NAME = u'blip.tv'
1996 def report_extraction(self, file_id):
1997 """Report information extraction."""
1998 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2000 def report_direct_download(self, title):
2001 """Report information extraction."""
2002 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2004 def _real_extract(self, url):
# Fetch metadata via blip.tv's JSON API ('skin=json'); a 'video/*'
# Content-Type on the response means the URL is already a direct download.
2005 mobj = re.match(self._VALID_URL, url)
2007 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# 'cchar' ('?' or '&') is chosen on elided lines depending on whether the
# URL already carries a query string.
2014 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2015 request = urllib2.Request(json_url.encode('utf-8'))
2016 self.report_extraction(mobj.group(1))
2019 urlh = urllib2.urlopen(request)
2020 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2021 basename = url.split('/')[-1]
2022 title,ext = os.path.splitext(basename)
2023 title = title.decode('UTF-8')
2024 ext = ext.replace('.', '')
2025 self.report_direct_download(title)
# (elided: info dict for the direct-download case is built here)
2033 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2034 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2036 if info is None: # Regular URL
2038 json_code = urlh.read()
2039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2044 json_data = json.loads(json_code)
2045 if 'Post' in json_data:
2046 data = json_data['Post']
# blip.tv datestamps look like '05-31-11 08:00AM'; normalized to YYYYMMDD.
2050 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2051 video_url = data['media']['url']
2052 umobj = re.match(self._URL_EXT, video_url)
2054 raise ValueError('Can not determine filename extension')
2055 ext = umobj.group(1)
2058 'id': data['item_id'],
2060 'uploader': data['display_name'],
2061 'upload_date': upload_date,
2062 'title': data['title'],
2064 'format': data['media']['mimeType'],
2065 'thumbnail': data['thumbnailUrl'],
2066 'description': data['description'],
2067 'player_url': data['embedUrl']
2069 except (ValueError,KeyError), err:
2070 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoof iTunes for the actual media download -- blip.tv serves direct
# files to the iTunes user agent.
2073 std_headers['User-Agent'] = 'iTunes/10.6.1'
2077 class MyVideoIE(InfoExtractor):
2078 """Information Extractor for myvideo.de."""
2080 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2081 IE_NAME = u'myvideo'
2083 def __init__(self, downloader=None):
2084 InfoExtractor.__init__(self, downloader)
2086 def report_download_webpage(self, video_id):
2087 """Report webpage download."""
2088 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2090 def report_extraction(self, video_id):
2091 """Report information extraction."""
2092 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2094 def _real_extract(self,url):
2095 mobj = re.match(self._VALID_URL, url)
2097 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2100 video_id = mobj.group(1)
2103 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2105 self.report_download_webpage(video_id)
2106 webpage = urllib2.urlopen(request).read()
2107 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2108 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2111 self.report_extraction(video_id)
2112 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2115 self._downloader.trouble(u'ERROR: unable to extract media URL')
2117 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2119 mobj = re.search('<title>([^<]+)</title>', webpage)
2121 self._downloader.trouble(u'ERROR: unable to extract title')
2124 video_title = mobj.group(1)
2130 'upload_date': u'NA',
2131 'title': video_title,
2137 class ComedyCentralIE(InfoExtractor):
2138 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): embedded numbering has gaps -- guard clauses, 'try:'
# headers, 'else:' branches and returns are elided here; verify against
# the complete file.
2140 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2141 IE_NAME = u'comedycentral'
2143 def report_extraction(self, episode_id):
2144 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2146 def report_config_download(self, episode_id):
2147 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2149 def report_index_download(self, episode_id):
2150 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2152 def report_player_url(self, episode_id):
2153 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2155 def _real_extract(self, url):
# Resolve ':tds' / ':colbert' shorthands to the show's full-episodes page,
# follow redirects to a specific episode, then walk the MRSS index and
# per-media config XML to collect one info dict per segment.
2156 mobj = re.match(self._VALID_URL, url)
2158 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2161 if mobj.group('shortname'):
2162 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2163 url = u'http://www.thedailyshow.com/full-episodes/'
2165 url = u'http://www.colbertnation.com/full-episodes/'
2166 mobj = re.match(self._VALID_URL, url)
2167 assert mobj is not None
# dlNewest: no explicit episode requested -> take the newest from redirect.
2169 dlNewest = not mobj.group('episode')
2171 epTitle = mobj.group('showname')
2173 epTitle = mobj.group('episode')
2175 req = urllib2.Request(url)
2176 self.report_extraction(epTitle)
2178 htmlHandle = urllib2.urlopen(req)
2179 html = htmlHandle.read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# geturl() reflects any redirect; re-match to pull the episode slug out.
2184 url = htmlHandle.geturl()
2185 mobj = re.match(self._VALID_URL, url)
2187 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2189 if mobj.group('episode') == '':
2190 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2192 epTitle = mobj.group('episode')
# Flash player params embed the mtvnservices URI that keys the MRSS feed.
2194 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2195 if len(mMovieParams) == 0:
2196 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2199 playerUrl_raw = mMovieParams[0][0]
2200 self.report_player_url(epTitle)
2202 urlHandle = urllib2.urlopen(playerUrl_raw)
2203 playerUrl = urlHandle.geturl()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2208 uri = mMovieParams[0][1]
2209 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2210 self.report_index_download(epTitle)
2212 indexXml = urllib2.urlopen(indexUrl).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2219 idoc = xml.etree.ElementTree.fromstring(indexXml)
2220 itemEls = idoc.findall('.//item')
2221 for itemEl in itemEls:
# guid is colon-separated; last part is the media id, second-to-last the show.
2222 mediaId = itemEl.findall('./guid')[0].text
2223 shortMediaId = mediaId.split(':')[-1]
2224 showId = mediaId.split(':')[-2].replace('.com', '')
2225 officialTitle = itemEl.findall('./title')[0].text
2226 officialDate = itemEl.findall('./pubDate')[0].text
2228 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2229 urllib.urlencode({'uri': mediaId}))
2230 configReq = urllib2.Request(configUrl)
2231 self.report_config_download(epTitle)
2233 configXml = urllib2.urlopen(configReq).read()
2234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2235 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2238 cdoc = xml.etree.ElementTree.fromstring(configXml)
# turls (built on elided lines) collects (bitrate, src) pairs.
2240 for rendition in cdoc.findall('.//rendition'):
2241 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2245 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2248 # For now, just pick the highest bitrate
2249 format,video_url = turls[-1]
2251 effTitle = showId + u'-' + epTitle
2256 'upload_date': officialDate,
2261 'description': officialTitle,
2262 'player_url': playerUrl
2265 results.append(info)
2270 class EscapistIE(InfoExtractor):
2271 """Information extractor for The Escapist """
# NOTE(review): elided lines (numbering gaps) hide guards/returns here.
2273 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2274 IE_NAME = u'escapist'
2276 def report_extraction(self, showName):
2277 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2279 def report_config_download(self, showName):
2280 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2282 def _real_extract(self, url):
# Pull og: meta tags from the page, then download the player's JS config
# (quasi-JSON) to find the media URL in its playlist.
2283 mobj = re.match(self._VALID_URL, url)
2285 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2287 showName = mobj.group('showname')
2288 videoId = mobj.group('episode')
2290 self.report_extraction(showName)
2292 webPage = urllib2.urlopen(url)
2293 webPageBytes = webPage.read()
# Decode using the charset from Content-Type, defaulting to UTF-8.
2294 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2295 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2300 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2301 description = unescapeHTML(descMatch.group(1))
2302 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2303 imgUrl = unescapeHTML(imgMatch.group(1))
2304 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2305 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2306 configUrlMatch = re.search('config=(.*)$', playerUrl)
2307 configUrl = urllib2.unquote(configUrlMatch.group(1))
2309 self.report_config_download(showName)
2311 configJSON = urllib2.urlopen(configUrl).read()
2312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2316 # Technically, it's JavaScript, not JSON
2317 configJSON = configJSON.replace("'", '"')
2320 config = json.loads(configJSON)
2321 except (ValueError,), err:
2322 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2325 playlist = config['playlist']
# Index 1 of the playlist is the actual episode entry.
2326 videoUrl = playlist[1]['url']
2331 'uploader': showName,
2332 'upload_date': None,
2336 'thumbnail': imgUrl,
2337 'description': description,
2338 'player_url': playerUrl,
2344 class CollegeHumorIE(InfoExtractor):
2345 """Information extractor for collegehumor.com"""
# NOTE(review): numbering gaps indicate elided guards/'try:'/returns.
2347 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2348 IE_NAME = u'collegehumor'
2350 def report_webpage(self, video_id):
2351 """Report information extraction."""
2352 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2354 def report_extraction(self, video_id):
2355 """Report information extraction."""
2356 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2358 def _real_extract(self, url):
# The public video id maps to an internal id embedded in the page, which
# keys the 'moogaloop' metadata XML describing the actual file.
2359 mobj = re.match(self._VALID_URL, url)
2361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2363 video_id = mobj.group('videoid')
2365 self.report_webpage(video_id)
2366 request = urllib2.Request(url)
2368 webpage = urllib2.urlopen(request).read()
2369 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2373 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2375 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2377 internal_video_id = m.group('internalvideoid')
2381 'internal_id': internal_video_id,
2384 self.report_extraction(video_id)
2385 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2387 metaXml = urllib2.urlopen(xmlUrl).read()
2388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2392 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2394 videoNode = mdoc.findall('./video')[0]
2395 info['description'] = videoNode.findall('./description')[0].text
2396 info['title'] = videoNode.findall('./caption')[0].text
2397 info['url'] = videoNode.findall('./file')[0].text
2398 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension taken from the file URL; doubles as the 'format' field.
2399 info['ext'] = info['url'].rpartition('.')[2]
2400 info['format'] = info['ext']
2402 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2408 class XVideosIE(InfoExtractor):
2409 """Information extractor for xvideos.com"""
# NOTE(review): elided lines (numbering gaps) hide the 'if mobj is None'
# guards, 'try:' headers and the final info dict head.
2411 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412 IE_NAME = u'xvideos'
2414 def report_webpage(self, video_id):
2415 """Report information extraction."""
2416 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2418 def report_extraction(self, video_id):
2419 """Report information extraction."""
2420 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2422 def _real_extract(self, url):
# URL, title and thumbnail are all scraped from the watch page with regexes.
2423 mobj = re.match(self._VALID_URL, url)
2425 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2427 video_id = mobj.group(1).decode('utf-8')
2429 self.report_webpage(video_id)
2431 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2433 webpage = urllib2.urlopen(request).read()
2434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2435 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2438 self.report_extraction(video_id)
# The flv URL is a percent-encoded query parameter in the page source.
2442 mobj = re.search(r'flv_url=(.+?)&', webpage)
2444 self._downloader.trouble(u'ERROR: unable to extract video url')
2446 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2450 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2452 self._downloader.trouble(u'ERROR: unable to extract video title')
2454 video_title = mobj.group(1).decode('utf-8')
2457 # Extract video thumbnail
2458 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2460 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2462 video_thumbnail = mobj.group(0).decode('utf-8')
2468 'upload_date': None,
2469 'title': video_title,
2472 'thumbnail': video_thumbnail,
2473 'description': None,
2480 class SoundcloudIE(InfoExtractor):
2481 """Information extractor for soundcloud.com
2482 To access the media, the uid of the song and a stream token
2483 must be extracted from the page source and the script must make
2484 a request to media.soundcloud.com/crossdomain.xml. Then
2485 the media can be grabbed by requesting from an url composed
2486 of the stream token and uid
# NOTE(review): numbering gaps show elided guards/returns in this class.
2489 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2490 IE_NAME = u'soundcloud'
2492 def __init__(self, downloader=None):
2493 InfoExtractor.__init__(self, downloader)
2495 def report_webpage(self, video_id):
2496 """Report information extraction."""
2497 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2499 def report_extraction(self, video_id):
2500 """Report information extraction."""
2501 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2503 def _real_extract(self, url):
2504 mobj = re.match(self._VALID_URL, url)
2506 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2509 # extract uploader (which is in the url)
2510 uploader = mobj.group(1).decode('utf-8')
2511 # extract simple title (uploader + slug of song title)
2512 slug_title = mobj.group(2).decode('utf-8')
2513 simple_title = uploader + u'-' + slug_title
2515 self.report_webpage('%s/%s' % (uploader, slug_title))
2517 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2519 webpage = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2524 self.report_extraction('%s/%s' % (uploader, slug_title))
2526 # extract uid and stream token that soundcloud hands out for access
2527 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2529 video_id = mobj.group(1)
2530 stream_token = mobj.group(2)
2532 # extract unsimplified title
2533 mobj = re.search('"title":"(.*?)",', webpage)
2535 title = mobj.group(1).decode('utf-8')
2537 title = simple_title
2539 # construct media url (with uid/token)
2540 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2541 mediaURL = mediaURL % (video_id, stream_token)
2544 description = u'No description available'
2545 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2547 description = mobj.group(1)
# Parse the human-readable date (e.g. 'November 8, 2010 14:30') to YYYYMMDD.
2551 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2554 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2555 except Exception, e:
# Date parsing is best-effort; failure is logged, not fatal.
2556 self._downloader.to_stderr(str(e))
2558 # for soundcloud, a request to a cross domain is required for cookies
2559 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2562 'id': video_id.decode('utf-8'),
2564 'uploader': uploader.decode('utf-8'),
2565 'upload_date': upload_date,
2570 'description': description.decode('utf-8')
2574 class InfoQIE(InfoExtractor):
2575 """Information extractor for infoq.com"""
# NOTE(review): the IE_NAME assignment and several guards are on elided
# lines (numbering gaps); report_webpage below references self.IE_NAME.
2577 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2580 def report_webpage(self, video_id):
2581 """Report information extraction."""
2582 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2584 def report_extraction(self, video_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2588 def _real_extract(self, url):
# The page embeds a base64-encoded path ('jsclassref') that, decoded,
# forms an rtmpe:// stream URL.
2589 mobj = re.match(self._VALID_URL, url)
2591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2594 self.report_webpage(url)
2596 request = urllib2.Request(url)
2598 webpage = urllib2.urlopen(request).read()
2599 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2600 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2603 self.report_extraction(url)
2607 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2609 self._downloader.trouble(u'ERROR: unable to extract video url')
2611 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2615 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2617 self._downloader.trouble(u'ERROR: unable to extract video title')
2619 video_title = mobj.group(1).decode('utf-8')
2621 # Extract description
2622 video_description = u'No description available.'
2623 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2624 if mobj is not None:
2625 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the stream URL.
2627 video_filename = video_url.split('/')[-1]
2628 video_id, extension = video_filename.split('.')
2634 'upload_date': None,
2635 'title': video_title,
2637 'format': extension, # Extension is always(?) mp4, but seems to be flv
2639 'description': video_description,
2645 class MixcloudIE(InfoExtractor):
2646 """Information extractor for www.mixcloud.com"""
# NOTE(review): 'try:' headers, 'return' statements and guard clauses sit
# on elided lines (numbering gaps) throughout this class.
2647 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648 IE_NAME = u'mixcloud'
2650 def __init__(self, downloader=None):
2651 InfoExtractor.__init__(self, downloader)
2653 def report_download_json(self, file_id):
2654 """Report JSON download."""
2655 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2657 def report_extraction(self, file_id):
2658 """Report information extraction."""
2659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2661 def get_urls(self, jsonData, fmt, bitrate='best'):
2662 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain [urls] list;
# the TypeError fallback handles the latter (no bitrate info).
2665 bitrate_list = jsonData[fmt]
2666 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2667 bitrate = max(bitrate_list) # select highest
2669 url_list = jsonData[fmt][bitrate]
2670 except TypeError: # we have no bitrate info.
2671 url_list = jsonData[fmt]
2674 def check_urls(self, url_list):
2675 """Returns 1st active url from list"""
# Probes each candidate URL; network errors move on to the next one.
2676 for url in url_list:
2678 urllib2.urlopen(url)
2680 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2685 def _print_formats(self, formats):
# Human-readable listing for --list-formats; extension is the url suffix.
2686 print 'Available formats:'
2687 for fmt in formats.keys():
2688 for b in formats[fmt]:
2690 ext = formats[fmt][b][0]
2691 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2692 except TypeError: # we have no bitrate info
2693 ext = formats[fmt][0]
2694 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2697 def _real_extract(self, url):
2698 mobj = re.match(self._VALID_URL, url)
2700 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2702 # extract uploader & filename from url
2703 uploader = mobj.group(1).decode('utf-8')
2704 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2706 # construct API request
2707 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2708 # retrieve .json file with links to files
2709 request = urllib2.Request(file_url)
2711 self.report_download_json(file_url)
2712 jsonData = urllib2.urlopen(request).read()
2713 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2714 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2718 json_data = json.loads(jsonData)
2719 player_url = json_data['player_swf_url']
2720 formats = dict(json_data['audio_formats'])
2722 req_format = self._downloader.params.get('format', None)
2725 if self._downloader.params.get('listformats', None):
2726 self._print_formats(formats)
# 'best': try every format until one yields a live URL.
2729 if req_format is None or req_format == 'best':
2730 for format_param in formats.keys():
2731 url_list = self.get_urls(formats, format_param)
2733 file_url = self.check_urls(url_list)
2734 if file_url is not None:
2737 if req_format not in formats.keys():
2738 self._downloader.trouble(u'ERROR: format is not available')
2741 url_list = self.get_urls(formats, req_format)
2742 file_url = self.check_urls(url_list)
2743 format_param = req_format
2746 'id': file_id.decode('utf-8'),
2747 'url': file_url.decode('utf-8'),
2748 'uploader': uploader.decode('utf-8'),
2749 'upload_date': u'NA',
2750 'title': json_data['name'],
2751 'ext': file_url.split('.')[-1].decode('utf-8'),
2752 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2753 'thumbnail': json_data['thumbnail_url'],
2754 'description': json_data['description'],
2755 'player_url': player_url.decode('utf-8'),
2758 class StanfordOpenClassroomIE(InfoExtractor):
2759 """Information extractor for Stanford's Open ClassRoom"""
# Three cases by URL shape: a single video, a course page (list of
# videos), or the root page (list of courses). The list cases recurse via
# self.extract() on 'reference' entries.
# NOTE(review): numbering gaps indicate elided lines ('try:' headers,
# info dict heads, returns); verify against the complete file.
2761 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2762 IE_NAME = u'stanfordoc'
2764 def report_download_webpage(self, objid):
2765 """Report information extraction."""
2766 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2768 def report_extraction(self, video_id):
2769 """Report information extraction."""
2770 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2772 def _real_extract(self, url):
2773 mobj = re.match(self._VALID_URL, url)
2775 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2778 if mobj.group('course') and mobj.group('video'): # A specific video
2779 course = mobj.group('course')
2780 video = mobj.group('video')
2782 'id': course + '_' + video,
2785 self.report_extraction(info['id'])
2786 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2787 xmlUrl = baseUrl + video + '.xml'
2789 metaXml = urllib2.urlopen(xmlUrl).read()
2790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2793 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2795 info['title'] = mdoc.findall('./title')[0].text
2796 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2798 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2800 info['ext'] = info['url'].rpartition('.')[2]
2801 info['format'] = info['ext']
2803 elif mobj.group('course'): # A course page
2804 course = mobj.group('course')
2810 self.report_download_webpage(info['id'])
2812 coursepage = urllib2.urlopen(url).read()
2813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2817 m = re.search('<h1>([^<]+)</h1>', coursepage)
2819 info['title'] = unescapeHTML(m.group(1))
2821 info['title'] = info['id']
2823 m = re.search('<description>([^<]+)</description>', coursepage)
2825 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link on the course page becomes a recursive reference.
2827 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2830 'type': 'reference',
2831 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2835 for entry in info['list']:
2836 assert entry['type'] == 'reference'
2837 results += self.extract(entry['url'])
2842 'id': 'Stanford OpenClassroom',
2846 self.report_download_webpage(info['id'])
2847 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2849 rootpage = urllib2.urlopen(rootURL).read()
2850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2851 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2854 info['title'] = info['id']
2856 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2859 'type': 'reference',
2860 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2865 for entry in info['list']:
2866 assert entry['type'] == 'reference'
2867 results += self.extract(entry['url'])
2870 class MTVIE(InfoExtractor):
2871 """Information extractor for MTV.com"""
# NOTE(review): IE_NAME assignment and several guards sit on elided lines.
2873 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2876 def report_webpage(self, video_id):
2877 """Report information extraction."""
2878 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2880 def report_extraction(self, video_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2884 def _real_extract(self, url):
# Song/performer come from page meta tags; the mediaGen XML service maps
# (mtvn_uri, content id, video id) to rendition URLs.
2885 mobj = re.match(self._VALID_URL, url)
2887 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2889 if not mobj.group('proto'):
2890 url = 'http://' + url
2891 video_id = mobj.group('videoid')
2892 self.report_webpage(video_id)
2894 request = urllib2.Request(url)
2896 webpage = urllib2.urlopen(request).read()
2897 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2898 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2901 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2903 self._downloader.trouble(u'ERROR: unable to extract song name')
2905 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2906 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2908 self._downloader.trouble(u'ERROR: unable to extract performer')
2910 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2911 video_title = performer + ' - ' + song_name
2913 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' -- appears to be
# missing the word 'extract'; left untouched (runtime string).
2915 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2917 mtvn_uri = mobj.group(1)
2919 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2921 self._downloader.trouble(u'ERROR: unable to extract content id')
2923 content_id = mobj.group(1)
2925 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926 self.report_extraction(video_id)
2927 request = urllib2.Request(videogen_url)
2929 metadataXml = urllib2.urlopen(request).read()
2930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2931 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2934 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2935 renditions = mdoc.findall('.//rendition')
2937 # For now, always pick the highest quality.
2938 rendition = renditions[-1]
# Format label built from mime subtype + WxH + bitrate attributes.
2941 _,_,ext = rendition.attrib['type'].partition('/')
2942 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2943 video_url = rendition.find('./src').text
2945 self._downloader.trouble('Invalid rendition field.')
2951 'uploader': performer,
2952 'title': video_title,
2960 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Videos are segmented; each
# segment gets its own info dict ('<id>_part00', '<id>_part01', ...).
# NOTE(review): docstring/class body head, the 'def _gen_sid(self):' line
# and several guards/returns are on elided lines (numbering gaps).
2962 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2965 def __init__(self, downloader=None):
2966 InfoExtractor.__init__(self, downloader)
2968 def report_download_webpage(self, file_id):
2969 """Report webpage download."""
2970 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2972 def report_extraction(self, file_id):
2973 """Report information extraction."""
2974 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp + two bounded random integers.
2977 nowTime = int(time.time() * 1000)
2978 random1 = random.randint(1000,1998)
2979 random2 = random.randint(1000,9999)
2981 return "%d%d%d" %(nowTime,random1,random2)
2983 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of the character alphabet; the seed
# comes from the playlist JSON, so server and client derive the same mix.
2985 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2987 for i in range(len(source)):
2988 seed = (seed * 211 + 30031 ) % 65536
2989 index = math.floor(seed / 65536 * len(source) )
2990 mixed.append(source[int(index)])
2991 source.remove(source[int(index)])
2992 #return ''.join(mixed)
2995 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated obfuscated file id through the mix string.
2996 mixed = self._get_file_ID_mix_string(seed)
2997 ids = fileId.split('*')
3001 realId.append(mixed[int(ch)])
3002 return ''.join(realId)
3004 def _real_extract(self, url):
3005 mobj = re.match(self._VALID_URL, url)
3007 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3009 video_id = mobj.group('ID')
3011 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3013 request = urllib2.Request(info_url, None, std_headers)
3015 self.report_download_webpage(video_id)
3016 jsondata = urllib2.urlopen(request).read()
3017 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3018 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3021 self.report_extraction(video_id)
3023 config = json.loads(jsondata)
3025 video_title = config['data'][0]['title']
3026 seed = config['data'][0]['seed']
3028 format = self._downloader.params.get('format', None)
3029 supported_format = config['data'][0]['streamfileids'].keys()
3031 if format is None or format == 'best':
3032 if 'hd2' in supported_format:
3037 elif format == 'worst':
3045 fileid = config['data'][0]['streamfileids'][format]
3046 seg_number = len(config['data'][0]['segs'][format])
# One key per segment authorizes that segment's download URL.
3049 for i in xrange(seg_number):
3050 keys.append(config['data'][0]['segs'][format][i]['k'])
3053 #youku only could be viewed from mainland china
3055 self._downloader.trouble(u'ERROR: unable to extract info section')
3059 sid = self._gen_sid()
3060 fileid = self._get_file_id(fileid, seed)
3062 #column 8,9 of fileid represent the segment number
3063 #fileid[7:9] should be changed
3064 for index, key in enumerate(keys):
3066 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3067 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3070 'id': '%s_part%02d' % (video_id, index),
3071 'url': download_url,
3073 'title': video_title,
3077 files_info.append(info)
3082 class XNXXIE(InfoExtractor):
3083 """Information extractor for xnxx.com"""
3085 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3087 VIDEO_URL_RE = r'flv_url=(.*?)&'
3088 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3089 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
def report_webpage(self, video_id):
    """Tell the downloader that the video's webpage is being fetched."""
    status = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Tell the downloader that metadata extraction has started."""
    status = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(status)
3099 def _real_extract(self, url):
3100 mobj = re.match(self._VALID_URL, url)
3102 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3104 video_id = mobj.group(1).decode('utf-8')
3106 self.report_webpage(video_id)
3108 # Get webpage content
3110 webpage = urllib2.urlopen(url).read()
3111 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3112 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3115 result = re.search(self.VIDEO_URL_RE, webpage)
3117 self._downloader.trouble(u'ERROR: unable to extract video url')
3119 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3121 result = re.search(self.VIDEO_TITLE_RE, webpage)
3123 self._downloader.trouble(u'ERROR: unable to extract video title')
3125 video_title = result.group(1).decode('utf-8')
3127 result = re.search(self.VIDEO_THUMB_RE, webpage)
3129 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3131 video_thumbnail = result.group(1).decode('utf-8')
3133 info = {'id': video_id,
3136 'upload_date': None,
3137 'title': video_title,
3140 'thumbnail': video_thumbnail,
3141 'description': None,