2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Register the FileDownloader that will consume extracted info dicts.
        self.set_downloader(downloader)
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: the real work happens in the subclass override.
        return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op hook in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op hook in the base class.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the lines below are the body of the verbose _VALID_URL
    # pattern; it is matched with re.VERBOSE (see suitable() below).
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'  # machine key used for .netrc credential lookup
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag code -> container extension (default elsewhere is 'flv')
    _video_extensions = {
    '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag code -> human-readable resolution, used by _print_formats()
    _video_dimensions = {
157 def suitable(self, url):
158 """Receives a URL and returns True if suitable for this IE."""
159 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
161 def report_lang(self):
162 """Report attempt to set language."""
163 self._downloader.to_screen(u'[youtube] Setting language')
165 def report_login(self):
166 """Report attempt to log in."""
167 self._downloader.to_screen(u'[youtube] Logging in')
169 def report_age_confirmation(self):
170 """Report attempt to confirm age."""
171 self._downloader.to_screen(u'[youtube] Confirming age')
173 def report_video_webpage_download(self, video_id):
174 """Report attempt to download video webpage."""
175 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
177 def report_video_info_webpage_download(self, video_id):
178 """Report attempt to download video info webpage."""
179 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
181 def report_video_subtitles_download(self, video_id):
182 """Report attempt to download video info webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
185 def report_information_extraction(self, video_id):
186 """Report attempt to extract video information."""
187 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
189 def report_unavailable_format(self, video_id, format):
190 """Report extracted video URL."""
191 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
193 def report_rtmp_download(self):
194 """Indicate the download will use the RTMP protocol."""
195 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default cue length (seconds) when no dur attribute
            # NOTE(review): start must be a float by this point — confirm it is
            # converted from the regex string before this addition.
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by SRT.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            # Each SRT cue: index line, time-range line, caption, blank line.
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # Printed once per format code x; unknown codes fall back to flv/???.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Set English language, log in (explicit or .netrc credentials) and
        confirm age so that restricted videos can be retrieved."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Credentials stored under the 'youtube' machine entry in ~/.netrc
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English interface so later metadata regexes match.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age by POSTing the verification form.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract uploader, title, stream URLs and (optionally) subtitles for
        a YouTube video and hand the resulting dict(s) to the downloader."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage; has_verified=1 skips the age-gate interstitial.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the \/ sequences embedded in the page's JSON.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' contexts until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (missing thumbnail is only a warning)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: normalise separators, then try the known date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions: failures raise Trouble and are downgraded to
        # warnings by the handler below.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            # Map language code -> track name.
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # Fall back to an arbitrary available language.
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            # Subtitle problems are warnings, never fatal.
            self._downloader.trouble(trouble[0])

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is not checked by the filter above — confirm
            # every stream entry carries it, or this raises KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # One info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Fields consumed by FileDownloader.process_info().
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Fetched by _real_initialize() before the family filter is disabled.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
495 def __init__(self, downloader=None):
496 InfoExtractor.__init__(self, downloader)
498 def report_disclaimer(self):
499 """Report disclaimer retrieval."""
500 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
502 def report_age_confirmation(self):
503 """Report attempt to confirm age."""
504 self._downloader.to_screen(u'[metacafe] Confirming age')
506 def report_download_webpage(self, video_id):
507 """Report webpage download."""
508 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
510 def report_extraction(self, video_id):
511 """Report information extraction."""
512 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter/age form so
        that filtered videos become reachable."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: submit the over-18 form.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate embedded YouTube videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: the media URL lives inside the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # (?i) because Dailymotion serves many country TLDs (.fr, .de, ...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
622 def __init__(self, downloader=None):
623 InfoExtractor.__init__(self, downloader)
625 def report_download_webpage(self, video_id):
626 """Report webpage download."""
627 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
629 def report_extraction(self, video_id):
630 """Report information extraction."""
631 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality stream URL, title, uploader and date for a
        Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip trailing title/query parts from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Probe qualities from best to worst; keep the first one present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the many national Google Video domains.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
716 def __init__(self, downloader=None):
717 InfoExtractor.__init__(self, downloader)
719 def report_download_webpage(self, video_id):
720 """Report webpage download."""
721 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
723 def report_extraction(self, video_id):
724 """Report information extraction."""
725 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract stream URL, title and description from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Turn the literal backslash escapes into their real characters.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only direct .flv 'current=' links are supported by this pattern.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    # (non-/watch/ URLs are rewritten in _real_extract before extraction)
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
881 def __init__(self, downloader=None):
882 InfoExtractor.__init__(self, downloader)
884 def report_download_webpage(self, video_id):
885 """Report webpage download."""
886 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
888 def report_extraction(self, video_id):
889 """Report information extraction."""
890 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract stream URL and metadata from a Yahoo! Video page; URLs not
        in /watch/ form are first rewritten and processed recursively."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # Recurse once on the canonical URL; new_video=False marks the retry.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the 'people|profile' path alternative, not
        # the display name captured by group(2) — confirm which is intended.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Fields consumed by FileDownloader.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): 'thumbnail' appears twice in this dict; this later,
            # undecoded entry overwrites the decoded one above.
            'thumbnail': video_thumbnail,
1016 class VimeoIE(InfoExtractor):
1017 """Information extractor for vimeo.com."""
1019 # _VALID_URL matches Vimeo URLs
1020 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1023 def __init__(self, downloader=None):
1024 InfoExtractor.__init__(self, downloader)
1026 def report_download_webpage(self, video_id):
1027 """Report webpage download."""
1028 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1030 def report_extraction(self, video_id):
1031 """Report information extraction."""
1032 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1034 def _real_extract(self, url, new_video=True):
1035 # Extract ID from URL
1036 mobj = re.match(self._VALID_URL, url)
1038 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1041 video_id = mobj.group(1)
1043 # Retrieve video webpage to extract further information
1044 request = urllib2.Request(url, None, std_headers)
1046 self.report_download_webpage(video_id)
1047 webpage = urllib2.urlopen(request).read()
1048 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1049 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1052 # Now we begin extracting as much information as we can from what we
1053 # retrieved. First we extract the information common to all extractors,
1054 # and latter we extract those that are Vimeo specific.
1055 self.report_extraction(video_id)
1057 # Extract the config JSON
1058 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1060 config = json.loads(config)
1062 self._downloader.trouble(u'ERROR: unable to extract info section')
1066 video_title = config["video"]["title"]
1069 video_uploader = config["video"]["owner"]["name"]
1071 # Extract video thumbnail
1072 video_thumbnail = config["video"]["thumbnail"]
1074 # Extract video description
1075 video_description = get_element_by_id("description", webpage.decode('utf8'))
1076 if video_description: video_description = clean_html(video_description)
1077 else: video_description = ''
1079 # Extract upload date
1080 video_upload_date = u'NA'
1081 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1082 if mobj is not None:
1083 video_upload_date = mobj.group(1)
1085 # Vimeo specific: extract request signature and timestamp
1086 sig = config['request']['signature']
1087 timestamp = config['request']['timestamp']
1089 # Vimeo specific: extract video codec and quality information
1090 # TODO bind to format param
1091 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1092 for codec in codecs:
1093 if codec[0] in config["video"]["files"]:
1094 video_codec = codec[0]
1095 video_extension = codec[1]
1096 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1097 else: quality = 'sd'
1100 self._downloader.trouble(u'ERROR: no known codec found')
1103 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1104 %(video_id, sig, timestamp, quality, video_codec.upper())
1109 'uploader': video_uploader,
1110 'upload_date': video_upload_date,
1111 'title': video_title,
1112 'ext': video_extension,
1113 'thumbnail': video_thumbnail,
1114 'description': video_description,
1119 class GenericIE(InfoExtractor):
1120 """Generic last-resort information extractor."""
1123 IE_NAME = u'generic'
1125 def __init__(self, downloader=None):
1126 InfoExtractor.__init__(self, downloader)
1128 def report_download_webpage(self, video_id):
1129 """Report webpage download."""
1130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1133 def report_extraction(self, video_id):
1134 """Report information extraction."""
1135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1137 def report_following_redirect(self, new_url):
1138 """Report information extraction."""
1139 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1141 def _test_redirect(self, url):
1142 """Check if it is a redirect, like url shorteners, in case restart chain."""
1143 class HeadRequest(urllib2.Request):
1144 def get_method(self):
1147 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1149 Subclass the HTTPRedirectHandler to make it use our
1150 HeadRequest also on the redirected URL
1152 def redirect_request(self, req, fp, code, msg, headers, newurl):
1153 if code in (301, 302, 303, 307):
1154 newurl = newurl.replace(' ', '%20')
1155 newheaders = dict((k,v) for k,v in req.headers.items()
1156 if k.lower() not in ("content-length", "content-type"))
1157 return HeadRequest(newurl,
1159 origin_req_host=req.get_origin_req_host(),
1162 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1164 class HTTPMethodFallback(urllib2.BaseHandler):
1166 Fallback to GET if HEAD is not allowed (405 HTTP error)
1168 def http_error_405(self, req, fp, code, msg, headers):
1172 newheaders = dict((k,v) for k,v in req.headers.items()
1173 if k.lower() not in ("content-length", "content-type"))
1174 return self.parent.open(urllib2.Request(req.get_full_url(),
1176 origin_req_host=req.get_origin_req_host(),
1180 opener = urllib2.OpenerDirector()
1181 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1182 HTTPMethodFallback, HEADRedirectHandler,
1183 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1184 opener.add_handler(handler())
1186 response = opener.open(HeadRequest(url))
1187 new_url = response.geturl()
1189 if url == new_url: return False
1191 self.report_following_redirect(new_url)
1192 self._downloader.download([new_url])
1195 def _real_extract(self, url):
1196 if self._test_redirect(url): return
1198 video_id = url.split('/')[-1]
1199 request = urllib2.Request(url)
1201 self.report_download_webpage(video_id)
1202 webpage = urllib2.urlopen(request).read()
1203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1206 except ValueError, err:
1207 # since this is the last-resort InfoExtractor, if
1208 # this error is thrown, it'll be thrown here
1209 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212 self.report_extraction(video_id)
1213 # Start with something easy: JW Player in SWFObject
1214 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1216 # Broaden the search a little bit
1217 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1219 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1222 # It's possible that one of the regexes
1223 # matched, but returned an empty group:
1224 if mobj.group(1) is None:
1225 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228 video_url = urllib.unquote(mobj.group(1))
1229 video_id = os.path.basename(video_url)
1231 # here's a fun little line of code for you:
1232 video_extension = os.path.splitext(video_id)[1][1:]
1233 video_id = os.path.splitext(video_id)[0]
1235 # it's tempting to parse this further, but you would
1236 # have to take into account all the variations like
1237 # Video Title - Site Name
1238 # Site Name | Video Title
1239 # Video Title - Tagline | Site Name
1240 # and so on and so forth; it's just not practical
1241 mobj = re.search(r'<title>(.*)</title>', webpage)
1243 self._downloader.trouble(u'ERROR: unable to extract title')
1245 video_title = mobj.group(1).decode('utf-8')
1247 # video uploader is domain name
1248 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1250 self._downloader.trouble(u'ERROR: unable to extract title')
1252 video_uploader = mobj.group(1).decode('utf-8')
1255 'id': video_id.decode('utf-8'),
1256 'url': video_url.decode('utf-8'),
1257 'uploader': video_uploader,
1258 'upload_date': u'NA',
1259 'title': video_title,
1260 'ext': video_extension.decode('utf-8'),
1266 class YoutubeSearchIE(InfoExtractor):
1267 """Information Extractor for YouTube search queries."""
1268 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1269 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1270 _max_youtube_results = 1000
1271 IE_NAME = u'youtube:search'
1273 def __init__(self, downloader=None):
1274 InfoExtractor.__init__(self, downloader)
1276 def report_download_page(self, query, pagenum):
1277 """Report attempt to download search page with given number."""
1278 query = query.decode(preferredencoding())
1279 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1281 def _real_extract(self, query):
1282 mobj = re.match(self._VALID_URL, query)
1284 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1287 prefix, query = query.split(':')
1289 query = query.encode('utf-8')
1291 self._download_n_results(query, 1)
1293 elif prefix == 'all':
1294 self._download_n_results(query, self._max_youtube_results)
1300 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1302 elif n > self._max_youtube_results:
1303 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1304 n = self._max_youtube_results
1305 self._download_n_results(query, n)
1307 except ValueError: # parsing prefix as integer fails
1308 self._download_n_results(query, 1)
1311 def _download_n_results(self, query, n):
1312 """Downloads a specified number of results for a query"""
1318 while (50 * pagenum) < limit:
1319 self.report_download_page(query, pagenum+1)
1320 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1321 request = urllib2.Request(result_url)
1323 data = urllib2.urlopen(request).read()
1324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1327 api_response = json.loads(data)['data']
1329 new_ids = list(video['id'] for video in api_response['items'])
1330 video_ids += new_ids
1332 limit = min(n, api_response['totalItems'])
1335 if len(video_ids) > n:
1336 video_ids = video_ids[:n]
1337 for id in video_ids:
1338 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1342 class GoogleSearchIE(InfoExtractor):
1343 """Information Extractor for Google Video search queries."""
1344 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1345 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1346 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1347 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1348 _max_google_results = 1000
1349 IE_NAME = u'video.google:search'
1351 def __init__(self, downloader=None):
1352 InfoExtractor.__init__(self, downloader)
1354 def report_download_page(self, query, pagenum):
1355 """Report attempt to download playlist page with given number."""
1356 query = query.decode(preferredencoding())
1357 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1359 def _real_extract(self, query):
1360 mobj = re.match(self._VALID_URL, query)
1362 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1365 prefix, query = query.split(':')
1367 query = query.encode('utf-8')
1369 self._download_n_results(query, 1)
1371 elif prefix == 'all':
1372 self._download_n_results(query, self._max_google_results)
1378 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1380 elif n > self._max_google_results:
1381 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1382 n = self._max_google_results
1383 self._download_n_results(query, n)
1385 except ValueError: # parsing prefix as integer fails
1386 self._download_n_results(query, 1)
1389 def _download_n_results(self, query, n):
1390 """Downloads a specified number of results for a query"""
1396 self.report_download_page(query, pagenum)
1397 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1398 request = urllib2.Request(result_url)
1400 page = urllib2.urlopen(request).read()
1401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1405 # Extract video identifiers
1406 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1407 video_id = mobj.group(1)
1408 if video_id not in video_ids:
1409 video_ids.append(video_id)
1410 if len(video_ids) == n:
1411 # Specified n videos reached
1412 for id in video_ids:
1413 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1416 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1417 for id in video_ids:
1418 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1421 pagenum = pagenum + 1
1424 class YahooSearchIE(InfoExtractor):
1425 """Information Extractor for Yahoo! Video search queries."""
1426 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1427 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1428 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1429 _MORE_PAGES_INDICATOR = r'\s*Next'
1430 _max_yahoo_results = 1000
1431 IE_NAME = u'video.yahoo:search'
1433 def __init__(self, downloader=None):
1434 InfoExtractor.__init__(self, downloader)
1436 def report_download_page(self, query, pagenum):
1437 """Report attempt to download playlist page with given number."""
1438 query = query.decode(preferredencoding())
1439 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1441 def _real_extract(self, query):
1442 mobj = re.match(self._VALID_URL, query)
1444 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1447 prefix, query = query.split(':')
1449 query = query.encode('utf-8')
1451 self._download_n_results(query, 1)
1453 elif prefix == 'all':
1454 self._download_n_results(query, self._max_yahoo_results)
1460 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462 elif n > self._max_yahoo_results:
1463 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1464 n = self._max_yahoo_results
1465 self._download_n_results(query, n)
1467 except ValueError: # parsing prefix as integer fails
1468 self._download_n_results(query, 1)
1471 def _download_n_results(self, query, n):
1472 """Downloads a specified number of results for a query"""
1475 already_seen = set()
1479 self.report_download_page(query, pagenum)
1480 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1481 request = urllib2.Request(result_url)
1483 page = urllib2.urlopen(request).read()
1484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1488 # Extract video identifiers
1489 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1490 video_id = mobj.group(1)
1491 if video_id not in already_seen:
1492 video_ids.append(video_id)
1493 already_seen.add(video_id)
1494 if len(video_ids) == n:
1495 # Specified n videos reached
1496 for id in video_ids:
1497 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1500 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1501 for id in video_ids:
1502 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1505 pagenum = pagenum + 1
1508 class YoutubePlaylistIE(InfoExtractor):
1509 """Information Extractor for YouTube playlists."""
1511 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1512 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1513 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1514 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1515 IE_NAME = u'youtube:playlist'
1517 def __init__(self, downloader=None):
1518 InfoExtractor.__init__(self, downloader)
1520 def report_download_page(self, playlist_id, pagenum):
1521 """Report attempt to download playlist page with given number."""
1522 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1524 def _real_extract(self, url):
1525 # Extract playlist id
1526 mobj = re.match(self._VALID_URL, url)
1528 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1532 if mobj.group(3) is not None:
1533 self._downloader.download([mobj.group(3)])
1536 # Download playlist pages
1537 # prefix is 'p' as default for playlists but there are other types that need extra care
1538 playlist_prefix = mobj.group(1)
1539 if playlist_prefix == 'a':
1540 playlist_access = 'artist'
1542 playlist_prefix = 'p'
1543 playlist_access = 'view_play_list'
1544 playlist_id = mobj.group(2)
1549 self.report_download_page(playlist_id, pagenum)
1550 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1551 request = urllib2.Request(url)
1553 page = urllib2.urlopen(request).read()
1554 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1558 # Extract video identifiers
1560 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1561 if mobj.group(1) not in ids_in_page:
1562 ids_in_page.append(mobj.group(1))
1563 video_ids.extend(ids_in_page)
1565 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 pagenum = pagenum + 1
1569 playliststart = self._downloader.params.get('playliststart', 1) - 1
1570 playlistend = self._downloader.params.get('playlistend', -1)
1571 if playlistend == -1:
1572 video_ids = video_ids[playliststart:]
1574 video_ids = video_ids[playliststart:playlistend]
1576 for id in video_ids:
1577 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1581 class YoutubeChannelIE(InfoExtractor):
1582 """Information Extractor for YouTube channels."""
1584 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1585 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1586 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1587 IE_NAME = u'youtube:channel'
1589 def report_download_page(self, channel_id, pagenum):
1590 """Report attempt to download channel page with given number."""
1591 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1593 def _real_extract(self, url):
1594 # Extract channel id
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1600 # Download channel pages
1601 channel_id = mobj.group(1)
1606 self.report_download_page(channel_id, pagenum)
1607 url = self._TEMPLATE_URL % (channel_id, pagenum)
1608 request = urllib2.Request(url)
1610 page = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1615 # Extract video identifiers
1617 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1618 if mobj.group(1) not in ids_in_page:
1619 ids_in_page.append(mobj.group(1))
1620 video_ids.extend(ids_in_page)
1622 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1624 pagenum = pagenum + 1
1626 for id in video_ids:
1627 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1631 class YoutubeUserIE(InfoExtractor):
1632 """Information Extractor for YouTube users."""
1634 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1635 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1636 _GDATA_PAGE_SIZE = 50
1637 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1638 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1639 IE_NAME = u'youtube:user'
1641 def __init__(self, downloader=None):
1642 InfoExtractor.__init__(self, downloader)
1644 def report_download_page(self, username, start_index):
1645 """Report attempt to download user page."""
1646 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1647 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1649 def _real_extract(self, url):
1651 mobj = re.match(self._VALID_URL, url)
1653 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1656 username = mobj.group(1)
1658 # Download video ids using YouTube Data API. Result size per
1659 # query is limited (currently to 50 videos) so we need to query
1660 # page by page until there are no video ids - it means we got
1667 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1668 self.report_download_page(username, start_index)
1670 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1673 page = urllib2.urlopen(request).read()
1674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1675 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1678 # Extract video identifiers
1681 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1682 if mobj.group(1) not in ids_in_page:
1683 ids_in_page.append(mobj.group(1))
1685 video_ids.extend(ids_in_page)
1687 # A little optimization - if current page is not
1688 # "full", ie. does not contain PAGE_SIZE video ids then
1689 # we can assume that this page is the last one - there
1690 # are no more ids on further pages - no need to query
1693 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1698 all_ids_count = len(video_ids)
1699 playliststart = self._downloader.params.get('playliststart', 1) - 1
1700 playlistend = self._downloader.params.get('playlistend', -1)
1702 if playlistend == -1:
1703 video_ids = video_ids[playliststart:]
1705 video_ids = video_ids[playliststart:playlistend]
1707 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1708 (username, all_ids_count, len(video_ids)))
1710 for video_id in video_ids:
1711 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1714 class BlipTVUserIE(InfoExtractor):
1715 """Information Extractor for blip.tv users."""
1717 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1719 IE_NAME = u'blip.tv:user'
1721 def __init__(self, downloader=None):
1722 InfoExtractor.__init__(self, downloader)
1724 def report_download_page(self, username, pagenum):
1725 """Report attempt to download user page."""
1726 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1727 (self.IE_NAME, username, pagenum))
1729 def _real_extract(self, url):
1731 mobj = re.match(self._VALID_URL, url)
1733 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1736 username = mobj.group(1)
1738 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1740 request = urllib2.Request(url)
1743 page = urllib2.urlopen(request).read().decode('utf-8')
1744 mobj = re.search(r'data-users-id="([^"]+)"', page)
1745 page_base = page_base % mobj.group(1)
1746 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1747 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1751 # Download video ids using BlipTV Ajax calls. Result size per
1752 # query is limited (currently to 12 videos) so we need to query
1753 # page by page until there are no video ids - it means we got
1760 self.report_download_page(username, pagenum)
1762 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1765 page = urllib2.urlopen(request).read().decode('utf-8')
1766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1767 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1770 # Extract video identifiers
1773 for mobj in re.finditer(r'href="/([^"]+)"', page):
1774 if mobj.group(1) not in ids_in_page:
1775 ids_in_page.append(unescapeHTML(mobj.group(1)))
1777 video_ids.extend(ids_in_page)
1779 # A little optimization - if current page is not
1780 # "full", ie. does not contain PAGE_SIZE video ids then
1781 # we can assume that this page is the last one - there
1782 # are no more ids on further pages - no need to query
1785 if len(ids_in_page) < self._PAGE_SIZE:
1790 all_ids_count = len(video_ids)
1791 playliststart = self._downloader.params.get('playliststart', 1) - 1
1792 playlistend = self._downloader.params.get('playlistend', -1)
1794 if playlistend == -1:
1795 video_ids = video_ids[playliststart:]
1797 video_ids = video_ids[playliststart:playlistend]
1799 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1800 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1802 for video_id in video_ids:
1803 self._downloader.download([u'http://blip.tv/'+video_id])
1806 class DepositFilesIE(InfoExtractor):
1807 """Information extractor for depositfiles.com"""
1809 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1810 IE_NAME = u'DepositFiles'
1812 def __init__(self, downloader=None):
1813 InfoExtractor.__init__(self, downloader)
1815 def report_download_webpage(self, file_id):
1816 """Report webpage download."""
1817 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1819 def report_extraction(self, file_id):
1820 """Report information extraction."""
1821 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1823 def _real_extract(self, url):
1824 file_id = url.split('/')[-1]
1825 # Rebuild url in english locale
1826 url = 'http://depositfiles.com/en/files/' + file_id
1828 # Retrieve file webpage with 'Free download' button pressed
1829 free_download_indication = { 'gateway_result' : '1' }
1830 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1832 self.report_download_webpage(file_id)
1833 webpage = urllib2.urlopen(request).read()
1834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1838 # Search for the real file URL
1839 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1840 if (mobj is None) or (mobj.group(1) is None):
1841 # Try to figure out reason of the error.
1842 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1843 if (mobj is not None) and (mobj.group(1) is not None):
1844 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1845 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1847 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1850 file_url = mobj.group(1)
1851 file_extension = os.path.splitext(file_url)[1][1:]
1853 # Search for file title
1854 mobj = re.search(r'<b title="(.*?)">', webpage)
1856 self._downloader.trouble(u'ERROR: unable to extract title')
1858 file_title = mobj.group(1).decode('utf-8')
1861 'id': file_id.decode('utf-8'),
1862 'url': file_url.decode('utf-8'),
1864 'upload_date': u'NA',
1865 'title': file_title,
1866 'ext': file_extension.decode('utf-8'),
1872 class FacebookIE(InfoExtractor):
1873 """Information Extractor for Facebook"""
1875 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1876 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1877 _NETRC_MACHINE = 'facebook'
1878 _available_formats = ['video', 'highqual', 'lowqual']
1879 _video_extensions = {
1884 IE_NAME = u'facebook'
1886 def __init__(self, downloader=None):
1887 InfoExtractor.__init__(self, downloader)
1889 def _reporter(self, message):
1890 """Add header and report message."""
1891 self._downloader.to_screen(u'[facebook] %s' % message)
1893 def report_login(self):
1894 """Report attempt to log in."""
1895 self._reporter(u'Logging in')
1897 def report_video_webpage_download(self, video_id):
1898 """Report attempt to download video webpage."""
1899 self._reporter(u'%s: Downloading video webpage' % video_id)
1901 def report_information_extraction(self, video_id):
1902 """Report attempt to extract video information."""
1903 self._reporter(u'%s: Extracting video information' % video_id)
1905 def _parse_page(self, video_webpage):
1906 """Extract video information from page"""
1908 data = {'title': r'\("video_title", "(.*?)"\)',
1909 'description': r'<div class="datawrap">(.*?)</div>',
1910 'owner': r'\("video_owner_name", "(.*?)"\)',
1911 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1914 for piece in data.keys():
1915 mobj = re.search(data[piece], video_webpage)
1916 if mobj is not None:
1917 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1921 for fmt in self._available_formats:
1922 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1923 if mobj is not None:
1924 # URL is in a Javascript segment inside an escaped Unicode format within
1925 # the generally utf-8 page
1926 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1927 video_info['video_urls'] = video_urls
1931 def _real_initialize(self):
1932 if self._downloader is None:
1937 downloader_params = self._downloader.params
1939 # Attempt to use provided username and password or .netrc data
1940 if downloader_params.get('username', None) is not None:
1941 useremail = downloader_params['username']
1942 password = downloader_params['password']
1943 elif downloader_params.get('usenetrc', False):
1945 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1946 if info is not None:
1950 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1951 except (IOError, netrc.NetrcParseError), err:
1952 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1955 if useremail is None:
1964 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1967 login_results = urllib2.urlopen(request).read()
1968 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1969 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1975 def _real_extract(self, url):
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1980 video_id = mobj.group('ID')
1983 self.report_video_webpage_download(video_id)
1984 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1986 page = urllib2.urlopen(request)
1987 video_webpage = page.read()
1988 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1989 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1992 # Start extracting information
1993 self.report_information_extraction(video_id)
1995 # Extract information
1996 video_info = self._parse_page(video_webpage)
1999 if 'owner' not in video_info:
2000 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2002 video_uploader = video_info['owner']
2005 if 'title' not in video_info:
2006 self._downloader.trouble(u'ERROR: unable to extract video title')
2008 video_title = video_info['title']
2009 video_title = video_title.decode('utf-8')
2012 if 'thumbnail' not in video_info:
2013 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2014 video_thumbnail = ''
2016 video_thumbnail = video_info['thumbnail']
2020 if 'upload_date' in video_info:
2021 upload_time = video_info['upload_date']
2022 timetuple = email.utils.parsedate_tz(upload_time)
2023 if timetuple is not None:
2025 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2030 video_description = video_info.get('description', 'No description available.')
2032 url_map = video_info['video_urls']
2033 if len(url_map.keys()) > 0:
2034 # Decide which formats to download
2035 req_format = self._downloader.params.get('format', None)
2036 format_limit = self._downloader.params.get('format_limit', None)
2038 if format_limit is not None and format_limit in self._available_formats:
2039 format_list = self._available_formats[self._available_formats.index(format_limit):]
2041 format_list = self._available_formats
2042 existing_formats = [x for x in format_list if x in url_map]
2043 if len(existing_formats) == 0:
2044 self._downloader.trouble(u'ERROR: no known formats available for video')
2046 if req_format is None:
2047 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2048 elif req_format == 'worst':
2049 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2050 elif req_format == '-1':
2051 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2054 if req_format not in url_map:
2055 self._downloader.trouble(u'ERROR: requested format not available')
2057 video_url_list = [(req_format, url_map[req_format])] # Specific format
2060 for format_param, video_real_url in video_url_list:
2062 video_extension = self._video_extensions.get(format_param, 'mp4')
2065 'id': video_id.decode('utf-8'),
2066 'url': video_real_url.decode('utf-8'),
2067 'uploader': video_uploader.decode('utf-8'),
2068 'upload_date': upload_date,
2069 'title': video_title,
2070 'ext': video_extension.decode('utf-8'),
2071 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2072 'thumbnail': video_thumbnail.decode('utf-8'),
2073 'description': video_description.decode('utf-8'),
2078 class BlipTVIE(InfoExtractor):
2079 """Information extractor for blip.tv"""
2081 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2082 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2083 IE_NAME = u'blip.tv'
2085 def report_extraction(self, file_id):
2086 """Report information extraction."""
2087 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2089 def report_direct_download(self, title):
2090 """Report information extraction."""
2091 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2093 def _real_extract(self, url):
2094 mobj = re.match(self._VALID_URL, url)
2096 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2103 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2104 request = urllib2.Request(json_url.encode('utf-8'))
2105 self.report_extraction(mobj.group(1))
2108 urlh = urllib2.urlopen(request)
2109 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2110 basename = url.split('/')[-1]
2111 title,ext = os.path.splitext(basename)
2112 title = title.decode('UTF-8')
2113 ext = ext.replace('.', '')
2114 self.report_direct_download(title)
2122 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2123 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2125 if info is None: # Regular URL
2127 json_code = urlh.read()
2128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2129 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2133 json_data = json.loads(json_code)
2134 if 'Post' in json_data:
2135 data = json_data['Post']
2139 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2140 video_url = data['media']['url']
2141 umobj = re.match(self._URL_EXT, video_url)
2143 raise ValueError('Can not determine filename extension')
2144 ext = umobj.group(1)
2147 'id': data['item_id'],
2149 'uploader': data['display_name'],
2150 'upload_date': upload_date,
2151 'title': data['title'],
2153 'format': data['media']['mimeType'],
2154 'thumbnail': data['thumbnailUrl'],
2155 'description': data['description'],
2156 'player_url': data['embedUrl']
2158 except (ValueError,KeyError), err:
2159 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2162 std_headers['User-Agent'] = 'iTunes/10.6.1'
2166 class MyVideoIE(InfoExtractor):
2167 """Information Extractor for myvideo.de."""
2169 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2170 IE_NAME = u'myvideo'
2172 def __init__(self, downloader=None):
2173 InfoExtractor.__init__(self, downloader)
2175 def report_download_webpage(self, video_id):
2176 """Report webpage download."""
2177 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2179 def report_extraction(self, video_id):
2180 """Report information extraction."""
2181 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2183 def _real_extract(self,url):
2184 mobj = re.match(self._VALID_URL, url)
2186 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2189 video_id = mobj.group(1)
2192 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2194 self.report_download_webpage(video_id)
2195 webpage = urllib2.urlopen(request).read()
2196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2197 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2200 self.report_extraction(video_id)
2201 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2204 self._downloader.trouble(u'ERROR: unable to extract media URL')
2206 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2208 mobj = re.search('<title>([^<]+)</title>', webpage)
2210 self._downloader.trouble(u'ERROR: unable to extract title')
2213 video_title = mobj.group(1)
2219 'upload_date': u'NA',
2220 'title': video_title,
2226 class ComedyCentralIE(InfoExtractor):
2227 """Information extractor for The Daily Show and Colbert Report """
2229 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2230 IE_NAME = u'comedycentral'
2232 def report_extraction(self, episode_id):
2233 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2235 def report_config_download(self, episode_id):
2236 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2238 def report_index_download(self, episode_id):
2239 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2241 def report_player_url(self, episode_id):
2242 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2244 def _real_extract(self, url):
2245 mobj = re.match(self._VALID_URL, url)
2247 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2250 if mobj.group('shortname'):
2251 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2252 url = u'http://www.thedailyshow.com/full-episodes/'
2254 url = u'http://www.colbertnation.com/full-episodes/'
2255 mobj = re.match(self._VALID_URL, url)
2256 assert mobj is not None
2258 dlNewest = not mobj.group('episode')
2260 epTitle = mobj.group('showname')
2262 epTitle = mobj.group('episode')
2264 req = urllib2.Request(url)
2265 self.report_extraction(epTitle)
2267 htmlHandle = urllib2.urlopen(req)
2268 html = htmlHandle.read()
2269 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2270 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2273 url = htmlHandle.geturl()
2274 mobj = re.match(self._VALID_URL, url)
2276 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2278 if mobj.group('episode') == '':
2279 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2281 epTitle = mobj.group('episode')
2283 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2284 if len(mMovieParams) == 0:
2285 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2288 playerUrl_raw = mMovieParams[0][0]
2289 self.report_player_url(epTitle)
2291 urlHandle = urllib2.urlopen(playerUrl_raw)
2292 playerUrl = urlHandle.geturl()
2293 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2294 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2297 uri = mMovieParams[0][1]
2298 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2299 self.report_index_download(epTitle)
2301 indexXml = urllib2.urlopen(indexUrl).read()
2302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2303 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2308 idoc = xml.etree.ElementTree.fromstring(indexXml)
2309 itemEls = idoc.findall('.//item')
2310 for itemEl in itemEls:
2311 mediaId = itemEl.findall('./guid')[0].text
2312 shortMediaId = mediaId.split(':')[-1]
2313 showId = mediaId.split(':')[-2].replace('.com', '')
2314 officialTitle = itemEl.findall('./title')[0].text
2315 officialDate = itemEl.findall('./pubDate')[0].text
2317 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2318 urllib.urlencode({'uri': mediaId}))
2319 configReq = urllib2.Request(configUrl)
2320 self.report_config_download(epTitle)
2322 configXml = urllib2.urlopen(configReq).read()
2323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2327 cdoc = xml.etree.ElementTree.fromstring(configXml)
2329 for rendition in cdoc.findall('.//rendition'):
2330 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2334 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2337 # For now, just pick the highest bitrate
2338 format,video_url = turls[-1]
2340 effTitle = showId + u'-' + epTitle
2345 'upload_date': officialDate,
2350 'description': officialTitle,
2351 'player_url': playerUrl
2354 results.append(info)
2359 class EscapistIE(InfoExtractor):
2360 """Information extractor for The Escapist """
2362 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2363 IE_NAME = u'escapist'
2365 def report_extraction(self, showName):
2366 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2368 def report_config_download(self, showName):
2369 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2371 def _real_extract(self, url):
2372 mobj = re.match(self._VALID_URL, url)
2374 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2376 showName = mobj.group('showname')
2377 videoId = mobj.group('episode')
2379 self.report_extraction(showName)
2381 webPage = urllib2.urlopen(url)
2382 webPageBytes = webPage.read()
2383 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2384 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2389 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2390 description = unescapeHTML(descMatch.group(1))
2391 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2392 imgUrl = unescapeHTML(imgMatch.group(1))
2393 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2394 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2395 configUrlMatch = re.search('config=(.*)$', playerUrl)
2396 configUrl = urllib2.unquote(configUrlMatch.group(1))
2398 self.report_config_download(showName)
2400 configJSON = urllib2.urlopen(configUrl).read()
2401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2405 # Technically, it's JavaScript, not JSON
2406 configJSON = configJSON.replace("'", '"')
2409 config = json.loads(configJSON)
2410 except (ValueError,), err:
2411 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2414 playlist = config['playlist']
2415 videoUrl = playlist[1]['url']
2420 'uploader': showName,
2421 'upload_date': None,
2425 'thumbnail': imgUrl,
2426 'description': description,
2427 'player_url': playerUrl,
2433 class CollegeHumorIE(InfoExtractor):
2434 """Information extractor for collegehumor.com"""
2436 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2437 IE_NAME = u'collegehumor'
2439 def report_webpage(self, video_id):
2440 """Report information extraction."""
2441 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443 def report_extraction(self, video_id):
2444 """Report information extraction."""
2445 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447 def _real_extract(self, url):
2448 mobj = re.match(self._VALID_URL, url)
2450 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452 video_id = mobj.group('videoid')
2454 self.report_webpage(video_id)
2455 request = urllib2.Request(url)
2457 webpage = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2462 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2464 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2466 internal_video_id = m.group('internalvideoid')
2470 'internal_id': internal_video_id,
2473 self.report_extraction(video_id)
2474 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2476 metaXml = urllib2.urlopen(xmlUrl).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2481 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2483 videoNode = mdoc.findall('./video')[0]
2484 info['description'] = videoNode.findall('./description')[0].text
2485 info['title'] = videoNode.findall('./caption')[0].text
2486 info['url'] = videoNode.findall('./file')[0].text
2487 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2488 info['ext'] = info['url'].rpartition('.')[2]
2489 info['format'] = info['ext']
2491 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2497 class XVideosIE(InfoExtractor):
2498 """Information extractor for xvideos.com"""
2500 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2501 IE_NAME = u'xvideos'
2503 def report_webpage(self, video_id):
2504 """Report information extraction."""
2505 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2507 def report_extraction(self, video_id):
2508 """Report information extraction."""
2509 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2511 def _real_extract(self, url):
2512 mobj = re.match(self._VALID_URL, url)
2514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2516 video_id = mobj.group(1).decode('utf-8')
2518 self.report_webpage(video_id)
2520 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2522 webpage = urllib2.urlopen(request).read()
2523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2524 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2527 self.report_extraction(video_id)
2531 mobj = re.search(r'flv_url=(.+?)&', webpage)
2533 self._downloader.trouble(u'ERROR: unable to extract video url')
2535 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2539 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2541 self._downloader.trouble(u'ERROR: unable to extract video title')
2543 video_title = mobj.group(1).decode('utf-8')
2546 # Extract video thumbnail
2547 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2549 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2551 video_thumbnail = mobj.group(0).decode('utf-8')
2557 'upload_date': None,
2558 'title': video_title,
2561 'thumbnail': video_thumbnail,
2562 'description': None,
2569 class SoundcloudIE(InfoExtractor):
2570 """Information extractor for soundcloud.com
2571 To access the media, the uid of the song and a stream token
2572 must be extracted from the page source and the script must make
2573 a request to media.soundcloud.com/crossdomain.xml. Then
2574 the media can be grabbed by requesting from an url composed
2575 of the stream token and uid
2578 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2579 IE_NAME = u'soundcloud'
2581 def __init__(self, downloader=None):
2582 InfoExtractor.__init__(self, downloader)
2584 def report_webpage(self, video_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2588 def report_extraction(self, video_id):
2589 """Report information extraction."""
2590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2592 def _real_extract(self, url):
2593 mobj = re.match(self._VALID_URL, url)
2595 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2598 # extract uploader (which is in the url)
2599 uploader = mobj.group(1).decode('utf-8')
2600 # extract simple title (uploader + slug of song title)
2601 slug_title = mobj.group(2).decode('utf-8')
2602 simple_title = uploader + u'-' + slug_title
2604 self.report_webpage('%s/%s' % (uploader, slug_title))
2606 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608 webpage = urllib2.urlopen(request).read()
2609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2613 self.report_extraction('%s/%s' % (uploader, slug_title))
2615 # extract uid and stream token that soundcloud hands out for access
2616 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618 video_id = mobj.group(1)
2619 stream_token = mobj.group(2)
2621 # extract unsimplified title
2622 mobj = re.search('"title":"(.*?)",', webpage)
2624 title = mobj.group(1).decode('utf-8')
2626 title = simple_title
2628 # construct media url (with uid/token)
2629 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2630 mediaURL = mediaURL % (video_id, stream_token)
2633 description = u'No description available'
2634 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2636 description = mobj.group(1)
2640 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2643 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2644 except Exception, e:
2645 self._downloader.to_stderr(str(e))
2647 # for soundcloud, a request to a cross domain is required for cookies
2648 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2651 'id': video_id.decode('utf-8'),
2653 'uploader': uploader.decode('utf-8'),
2654 'upload_date': upload_date,
2659 'description': description.decode('utf-8')
2663 class InfoQIE(InfoExtractor):
2664 """Information extractor for infoq.com"""
2666 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2669 def report_webpage(self, video_id):
2670 """Report information extraction."""
2671 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2673 def report_extraction(self, video_id):
2674 """Report information extraction."""
2675 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2683 self.report_webpage(url)
2685 request = urllib2.Request(url)
2687 webpage = urllib2.urlopen(request).read()
2688 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2689 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2692 self.report_extraction(url)
2696 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2698 self._downloader.trouble(u'ERROR: unable to extract video url')
2700 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2704 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2706 self._downloader.trouble(u'ERROR: unable to extract video title')
2708 video_title = mobj.group(1).decode('utf-8')
2710 # Extract description
2711 video_description = u'No description available.'
2712 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2713 if mobj is not None:
2714 video_description = mobj.group(1).decode('utf-8')
2716 video_filename = video_url.split('/')[-1]
2717 video_id, extension = video_filename.split('.')
2723 'upload_date': None,
2724 'title': video_title,
2726 'format': extension, # Extension is always(?) mp4, but seems to be flv
2728 'description': video_description,
2734 class MixcloudIE(InfoExtractor):
2735 """Information extractor for www.mixcloud.com"""
2736 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2737 IE_NAME = u'mixcloud'
2739 def __init__(self, downloader=None):
2740 InfoExtractor.__init__(self, downloader)
2742 def report_download_json(self, file_id):
2743 """Report JSON download."""
2744 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2746 def report_extraction(self, file_id):
2747 """Report information extraction."""
2748 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2750 def get_urls(self, jsonData, fmt, bitrate='best'):
2751 """Get urls from 'audio_formats' section in json"""
2754 bitrate_list = jsonData[fmt]
2755 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2756 bitrate = max(bitrate_list) # select highest
2758 url_list = jsonData[fmt][bitrate]
2759 except TypeError: # we have no bitrate info.
2760 url_list = jsonData[fmt]
2763 def check_urls(self, url_list):
2764 """Returns 1st active url from list"""
2765 for url in url_list:
2767 urllib2.urlopen(url)
2769 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774 def _print_formats(self, formats):
2775 print 'Available formats:'
2776 for fmt in formats.keys():
2777 for b in formats[fmt]:
2779 ext = formats[fmt][b][0]
2780 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2781 except TypeError: # we have no bitrate info
2782 ext = formats[fmt][0]
2783 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2791 # extract uploader & filename from url
2792 uploader = mobj.group(1).decode('utf-8')
2793 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2795 # construct API request
2796 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2797 # retrieve .json file with links to files
2798 request = urllib2.Request(file_url)
2800 self.report_download_json(file_url)
2801 jsonData = urllib2.urlopen(request).read()
2802 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2803 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2807 json_data = json.loads(jsonData)
2808 player_url = json_data['player_swf_url']
2809 formats = dict(json_data['audio_formats'])
2811 req_format = self._downloader.params.get('format', None)
2814 if self._downloader.params.get('listformats', None):
2815 self._print_formats(formats)
2818 if req_format is None or req_format == 'best':
2819 for format_param in formats.keys():
2820 url_list = self.get_urls(formats, format_param)
2822 file_url = self.check_urls(url_list)
2823 if file_url is not None:
2826 if req_format not in formats.keys():
2827 self._downloader.trouble(u'ERROR: format is not available')
2830 url_list = self.get_urls(formats, req_format)
2831 file_url = self.check_urls(url_list)
2832 format_param = req_format
2835 'id': file_id.decode('utf-8'),
2836 'url': file_url.decode('utf-8'),
2837 'uploader': uploader.decode('utf-8'),
2838 'upload_date': u'NA',
2839 'title': json_data['name'],
2840 'ext': file_url.split('.')[-1].decode('utf-8'),
2841 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2842 'thumbnail': json_data['thumbnail_url'],
2843 'description': json_data['description'],
2844 'player_url': player_url.decode('utf-8'),
2847 class StanfordOpenClassroomIE(InfoExtractor):
2848 """Information extractor for Stanford's Open ClassRoom"""
2850 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2851 IE_NAME = u'stanfordoc'
2853 def report_download_webpage(self, objid):
2854 """Report information extraction."""
2855 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2857 def report_extraction(self, video_id):
2858 """Report information extraction."""
2859 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2861 def _real_extract(self, url):
2862 mobj = re.match(self._VALID_URL, url)
2864 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867 if mobj.group('course') and mobj.group('video'): # A specific video
2868 course = mobj.group('course')
2869 video = mobj.group('video')
2871 'id': course + '_' + video,
2874 self.report_extraction(info['id'])
2875 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2876 xmlUrl = baseUrl + video + '.xml'
2878 metaXml = urllib2.urlopen(xmlUrl).read()
2879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2880 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2882 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2884 info['title'] = mdoc.findall('./title')[0].text
2885 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2887 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2889 info['ext'] = info['url'].rpartition('.')[2]
2890 info['format'] = info['ext']
2892 elif mobj.group('course'): # A course page
2893 course = mobj.group('course')
2899 self.report_download_webpage(info['id'])
2901 coursepage = urllib2.urlopen(url).read()
2902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2903 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2906 m = re.search('<h1>([^<]+)</h1>', coursepage)
2908 info['title'] = unescapeHTML(m.group(1))
2910 info['title'] = info['id']
2912 m = re.search('<description>([^<]+)</description>', coursepage)
2914 info['description'] = unescapeHTML(m.group(1))
2916 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2919 'type': 'reference',
2920 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2924 for entry in info['list']:
2925 assert entry['type'] == 'reference'
2926 results += self.extract(entry['url'])
2931 'id': 'Stanford OpenClassroom',
2935 self.report_download_webpage(info['id'])
2936 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2938 rootpage = urllib2.urlopen(rootURL).read()
2939 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2940 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2943 info['title'] = info['id']
2945 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2948 'type': 'reference',
2949 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2954 for entry in info['list']:
2955 assert entry['type'] == 'reference'
2956 results += self.extract(entry['url'])
2959 class MTVIE(InfoExtractor):
2960 """Information extractor for MTV.com"""
2962 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2965 def report_webpage(self, video_id):
2966 """Report information extraction."""
2967 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2969 def report_extraction(self, video_id):
2970 """Report information extraction."""
2971 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2973 def _real_extract(self, url):
2974 mobj = re.match(self._VALID_URL, url)
2976 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2978 if not mobj.group('proto'):
2979 url = 'http://' + url
2980 video_id = mobj.group('videoid')
2981 self.report_webpage(video_id)
2983 request = urllib2.Request(url)
2985 webpage = urllib2.urlopen(request).read()
2986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2990 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2992 self._downloader.trouble(u'ERROR: unable to extract song name')
2994 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2995 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2997 self._downloader.trouble(u'ERROR: unable to extract performer')
2999 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3000 video_title = performer + ' - ' + song_name
3002 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3004 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3006 mtvn_uri = mobj.group(1)
3008 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3010 self._downloader.trouble(u'ERROR: unable to extract content id')
3012 content_id = mobj.group(1)
3014 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3015 self.report_extraction(video_id)
3016 request = urllib2.Request(videogen_url)
3018 metadataXml = urllib2.urlopen(request).read()
3019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3020 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3023 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3024 renditions = mdoc.findall('.//rendition')
3026 # For now, always pick the highest quality.
3027 rendition = renditions[-1]
3030 _,_,ext = rendition.attrib['type'].partition('/')
3031 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3032 video_url = rendition.find('./src').text
3034 self._downloader.trouble('Invalid rendition field.')
3040 'uploader': performer,
3041 'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple .flv segments; _real_extract returns
    one info dictionary per segment ('<id>_part00', '<id>_part01', ...).
    NOTE(review): the source listing dropped several structural lines
    (try/except headers, returns, else-branches, list initialisers); they
    are restored here from the surrounding pattern — verify against
    upstream youtube-dl.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id: <millis><rand1><rand2>."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with Youku's seeded generator.

        Returns the shuffled alphabet as a list of characters;
        _get_file_id() indexes into it to decode the real file id.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step matching Youku's player code.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated fileId via the seeded alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the Youku playlist JSON and build per-segment info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)

        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the user's requested quality onto a stream format key.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # TODO check error
            # youku only could be viewed from mainland china
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA',
            }
            files_info.append(info)

        return files_info
3171 class XNXXIE(InfoExtractor):
3172 """Information extractor for xnxx.com"""
3174 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3176 VIDEO_URL_RE = r'flv_url=(.*?)&'
3177 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3178 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3180 def report_webpage(self, video_id):
3181 """Report information extraction"""
3182 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3184 def report_extraction(self, video_id):
3185 """Report information extraction"""
3186 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3188 def _real_extract(self, url):
3189 mobj = re.match(self._VALID_URL, url)
3191 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3193 video_id = mobj.group(1).decode('utf-8')
3195 self.report_webpage(video_id)
3197 # Get webpage content
3199 webpage = urllib2.urlopen(url).read()
3200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3204 result = re.search(self.VIDEO_URL_RE, webpage)
3206 self._downloader.trouble(u'ERROR: unable to extract video url')
3208 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3210 result = re.search(self.VIDEO_TITLE_RE, webpage)
3212 self._downloader.trouble(u'ERROR: unable to extract video title')
3214 video_title = result.group(1).decode('utf-8')
3216 result = re.search(self.VIDEO_THUMB_RE, webpage)
3218 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3220 video_thumbnail = result.group(1).decode('utf-8')
3222 info = {'id': video_id,
3225 'upload_date': None,
3226 'title': video_title,
3229 'thumbnail': video_thumbnail,
3230 'description': None,
3236 class GooglePlusIE(InfoExtractor):
3237 """Information extractor for plus.google.com."""
3239 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3240 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor, wiring up the optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Announce that the post entry at *url* is being downloaded."""
    decoded = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % decoded)
def report_date(self, upload_date):
    """Log the upload date found for the current entry."""
    line = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(line)
def report_uploader(self, uploader):
    """Log the uploader name for the current entry."""
    name = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % name)
def report_title(self, video_title):
    """Log the title found for the current entry."""
    title = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title)
def report_extract_vid_page(self, video_page):
    """Log the video page being parsed for download links."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3265 def _real_extract(self, url):
3266 # Extract id from URL
3267 mobj = re.match(self._VALID_URL, url)
3269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3272 post_url = mobj.group(0)
3273 video_id = mobj.group(2)
3275 video_extension = 'flv'
3277 # Step 1, Retrieve post webpage to extract further information
3278 self.report_extract_entry(post_url)
3279 request = urllib2.Request(post_url)
3281 webpage = urllib2.urlopen(request).read()
3282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3283 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3286 # Extract update date
3288 pattern = 'title="Timestamp">(.*?)</a>'
3289 mobj = re.search(pattern, webpage)
3291 upload_date = mobj.group(1)
3292 # Convert timestring to a format suitable for filename
3293 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3294 upload_date = upload_date.strftime('%Y%m%d')
3295 self.report_date(upload_date)
3299 pattern = r'rel\="author".*?>(.*?)</a>'
3300 mobj = re.search(pattern, webpage)
3302 uploader = mobj.group(1)
3303 self.report_uploader(uploader)
3306 # Get the first line for title
3308 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3309 mobj = re.search(pattern, webpage)
3311 video_title = mobj.group(1)
3312 self.report_title(video_title)
3314 # Step 2, Stimulate clicking the image box to launch video
3315 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3316 mobj = re.search(pattern, webpage)
3318 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3320 video_page = mobj.group(1)
3321 request = urllib2.Request(video_page)
3323 webpage = urllib2.urlopen(request).read()
3324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3327 self.report_extract_vid_page(video_page)
3330 # Extract video links on video page
3331 """Extract video links of all sizes"""
3332 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3333 mobj = re.findall(pattern, webpage)
3335 self._downloader.trouble(u'ERROR: unable to extract video links')
3337 # Sort in resolution
3338 links = sorted(mobj)
3340 # Choose the lowest of the sort, i.e. highest resolution
3341 video_url = links[-1]
3342 # Only get the url. The resolution part in the tuple has no use anymore
3343 video_url = video_url[-1]
3344 # Treat escaped \u0026 style hex
3345 video_url = unicode(video_url, "unicode_escape")
3349 'id': video_id.decode('utf-8'),
3351 'uploader': uploader.decode('utf-8'),
3352 'upload_date': upload_date.decode('utf-8'),
3353 'title': video_title.decode('utf-8'),
3354 'ext': video_extension.decode('utf-8'),