2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader: Nickname of the video uploader, unescaped.
    upload_date: Video upload date (YYYYMMDD).
    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib2.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is supplied by each concrete subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the enclosing `def initialize(self):` header (and any
    # lazy-initialization guard it carries) is on lines elided from this
    # excerpt; only the body fragment is visible below.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an intervening line is elided here — presumably a
        # call to initialize() before extraction; confirm against the full file.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose pattern matching the many YouTube URL shapes (watch pages,
    # embeds, youtu.be short links, naked IDs).  NOTE(review): the
    # `_VALID_URL = r'''...'''` assignment line and one or two pattern lines
    # (the literal `v=` and the opening of a group) are elided from this
    # excerpt; only the visible pattern body follows.
                     (?:https?://)? # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         )? # optional -> youtube.com/xxxx is OK
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.  NOTE(review): all entries other than '38'
    # and the closing brace are elided from this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WIDTHxHEIGHT" display string.  NOTE(review): the dict entries
    # and closing brace are elided from this excerpt.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class so _VALID_URL is compiled with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into .srt subtitle text."""
        # NOTE(review): the `srt` accumulator initialization and the coercion
        # of `start` to float are on lines elided from this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by the .srt format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # NOTE(review): the `return srt` line is elided from this excerpt.

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set interface language, optionally log in, and confirm age."""
        if self._downloader is None:
        # NOTE(review): the early `return`, credential defaults, and several
        # `try:` headers in this method are on lines elided from this excerpt.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Look up credentials for the 'youtube' machine in ~/.netrc.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: only a warning is emitted.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force English so later regex scraping sees predictable markup.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # YouTube serves the login form again when credentials are rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age; unlike the steps above, a failure here is reported
        # through trouble() as an ERROR.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Fetch the watch page and get_video_info, choose formats, and build
        one result dict per selected format.

        NOTE(review): many guard lines (`if mobj is None:`, `try:` headers,
        `return`/`break`/`else:` lines) are elided throughout this excerpt.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage (gl/hl pin the locale; has_verified skips the age interstitial)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JavaScript-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape the page, then try several textual date layouts
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators (/ , -) to single spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (best-effort: failures raise Trouble, caught below
        # and downgraded to warnings)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
            # Map lang_code -> track name from the listing XML.
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Pick language: explicit option first, then English, then first listed.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems never abort extraction; just report them.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # rtmp streams carry a single URL with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap the candidate list at the requested quality ceiling.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected (itag, url) pair.  NOTE(review):
        # the results-list accumulation and final `return` are elided.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # e.g. "22 - 1280x720"; falls back to the extension for rtmp (itag None).
            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                    self._video_dimensions.get(format_param, '???'))

                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': video_format,
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter confirmation.

        NOTE(review): `try:` headers, early `return`s and the opening of the
        disclaimer_form dict are on lines elided from this excerpt.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe page.

        NOTE(review): `if mobj is None:` guards, `try:` headers and the
        surrounding `return [{ ... }]` of the result are elided here.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-XXXX" ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL and key out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo JSON escaping (\/ -> /).
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict body (delimiters elided in this excerpt).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title, uploader and date from a Dailymotion page.

        NOTE(review): `if mobj is None:` guards, `try:` headers, quality-probe
        body lines and the `return [{ ... }]` delimiters are elided here.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Drop the "_title" suffix and any query string from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Probe qualities from best to worst; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract URL, title, description and thumbnail from a Google Video page.

        NOTE(review): `if mobj is None:` guards, `try:` headers, the mp4
        branch that assigns video_url, and the `return [{ ... }]` delimiters
        are on lines elided from this excerpt.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No MP4 download URL found: fall back to the escaped FLV URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the page source ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when the user asked for it: it costs
        # an extra search-page request)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page.

        NOTE(review): `if mobj is None:` guards, `try:` headers, the
        `video_url = mediaURL` assignment and the `return [{ ... }]`
        delimiters are on lines elided from this excerpt.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video data; non-/watch/ URLs recurse once after rewriting.

        NOTE(review): `if mobj is None:` guards and `try:` headers are on
        lines elided from this excerpt, and the method continues past the
        end of the visible text (the result dict below is truncated).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict (truncated in this excerpt — the 'url' entry and the
        # closing `}]` continue past the visible text).
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, pulls the embedded JSON config out of it,
    and picks the best available codec/quality pair.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): typically ``if mobj is None:``
    guards, ``try:`` openers, ``else:``/``return`` lines.  The code below
    is kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.
    """

    # _VALID_URL matches Vimeo URLs; group 1 captures the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        # (``try:`` opener elided in this view)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page between the
        # " = {config:" and ",assets:" markers.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        # (``try:`` opener elided in this view)
        config = json.loads(config)
        # (``except`` clause and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # (``else:`` elided) fall back to the first advertised quality
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # (``break``/``else:`` and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the final download URL from id, signature, timestamp,
        # quality and codec.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # (``return [{`` and the 'id'/'url' entries elided in this view)
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First resolves URL-shortener style redirects (restarting the download
    chain on the target URL); otherwise scrapes the page for a direct
    media URL (JW Player / SWFObject patterns).

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): typically ``if mobj is None:``
    guards, ``try:`` openers, ``else:``/``return`` lines.  The code below
    is kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect was detected and is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(urllib2.Request):
            # Force an HTTP HEAD request so only headers are fetched.
            def get_method(self):
                # (``return "HEAD"`` elided in this view)

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # (remaining keyword args and ``else:`` elided in this view)
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        origin_req_host=req.get_origin_req_host(),
                # (remaining keyword args elided in this view)

        # Build a minimal opener by hand (not build_opener()) so that only
        # exactly these handlers participate in the HEAD probe.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect: continue with normal extraction.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the download chain with the resolved target URL.
        self._downloader.download([new_url])
        # (``return True`` elided in this view)

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        # (``try:`` opener elided in this view)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError, err:
        # since this is the last-resort InfoExtractor, if
        # this error is thrown, it'll be thrown here
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # (``if mobj is None:`` elided in this view)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # (inner ``if mobj is None:`` and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # (``return [{`` elided in this view) info dictionary:
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch:``, ``ytsearchN:`` and ``ytsearchall:`` pseudo-URLs
    and queues the matching watch-page URLs for download via the GData API.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``if``/``else:`` openers and local initialisations.  The code below is
    kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # %s = quoted query, %i = 1-based start index; pages of 50 results.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (``if prefix == '':`` elided in this view)
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (``return`` elided in this view)
        elif prefix == 'all':
        self._download_n_results(query, self._max_youtube_results)
        # (``return``/``else:``/``try: n = int(prefix)`` and the
        #  ``if n <= 0:`` check elided in this view)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        # (``return`` elided in this view)
        except ValueError: # parsing prefix as integer fails
        self._download_n_results(query, 1)
        # (``return`` elided in this view)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (initialisation of ``video_ids``/``pagenum``/``limit`` elided in
        #  this view; pages of 50 ids are fetched until ``limit`` is reached)
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            # (``try:`` opener elided in this view)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            # (``return`` elided in this view)
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports as available.
            limit = min(n, api_response['totalItems'])
            # (``pagenum += 1`` elided in this view)

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch:``, ``gvsearchN:`` and ``gvsearchall:`` pseudo-URLs
    by scraping the HTML result pages for ``videoplay?docid=`` links.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    loop openers and local initialisations.  The code below is kept
    byte-identical to the visible lines; ``(... elided ...)`` comments
    mark the gaps.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Regex matching a result link; group 1 is the document id.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of this marker means there is a further results page.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (``if prefix == '':`` elided in this view)
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (``return`` elided in this view)
        elif prefix == 'all':
        self._download_n_results(query, self._max_google_results)
        # (``return``/``else:``/``try: n = int(prefix)`` and the
        #  ``if n <= 0:`` check elided in this view)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        # (``return`` elided in this view)
        except ValueError: # parsing prefix as integer fails
        self._download_n_results(query, 1)
        # (``return`` elided in this view)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (initialisation of ``video_ids``/``pagenum`` and the enclosing
        #  ``while True:`` loop opener elided in this view)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # (``return`` elided in this view)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far.
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # (``return`` elided in this view)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch:``, ``yvsearchN:`` and ``yvsearchall:`` pseudo-URLs
    by scraping the HTML result pages for watch links.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    loop openers and local initialisations.  The code below is kept
    byte-identical to the visible lines; ``(... elided ...)`` comments
    mark the gaps.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Regex matching a result link; group 1 is "<uploader id>/<video id>".
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (``if prefix == '':`` elided in this view)
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (``return`` elided in this view)
        elif prefix == 'all':
        self._download_n_results(query, self._max_yahoo_results)
        # (``return``/``else:``/``try: n = int(prefix)`` and the
        #  ``if n <= 0:`` check elided in this view)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        # (``return`` elided in this view)
        except ValueError: # parsing prefix as integer fails
        self._download_n_results(query, 1)
        # (``return`` elided in this view)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (initialisation of ``video_ids``/``pagenum`` elided in this view)
        # Dedup across pages; ids are added to video_ids at most once.
        already_seen = set()
        # (``while True:`` loop opener elided in this view)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # (``return`` elided in this view)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # (``return`` elided in this view)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist pages, collects the watch-page video ids,
    applies --playlist-start/--playlist-end slicing, and queues each video.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``else:``/``break``/``return`` lines and local initialisations.  The
    code below is kept byte-identical to the visible lines;
    ``(... elided ...)`` comments mark the gaps.
    """

    # Group 1: list-type prefix (p/a/list...); group 2: playlist id;
    # group 3: an optional direct video id inside the playlist URL.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    # Presence of the pager's "next" class means more pages exist.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: the URL points at one video of the playlist.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            # (``return`` elided in this view)

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # (``else:`` elided) default playlist handling
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'

        playlist_id = mobj.group(2)
        # (initialisation of ``video_ids``/``pagenum`` and the
        #  ``while True:`` loop opener elided in this view)
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        # (``ids_in_page = []`` elided in this view)
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # (``break`` elided in this view)
        pagenum = pagenum + 1

        # --playlist-start is 1-based on the command line; 0-based here.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (``else:`` elided)
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated channel video list and queues every watch URL.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``break``/``return`` lines and local initialisations.  The code below
    is kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # (initialisation of ``video_ids``/``pagenum`` and the
        #  ``while True:`` loop opener elided in this view)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = urllib2.Request(url)
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        # (``ids_in_page = []`` elided in this view)
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # (``break`` elided in this view)
        pagenum = pagenum + 1

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads via the GData API, applies
    --playlist-start/--playlist-end slicing, and queues each video.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``break``/``return`` lines and local initialisations.  The code below
    is kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request; used for pagination and the
    # "short page means last page" optimisation below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # (initialisation of ``video_ids``/``pagenum`` and the
        #  ``while True:`` loop opener elided in this view)
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        # (``ids_in_page = []`` elided in this view)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # (``break`` and ``pagenum += 1`` elided in this view)

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based on the command line; 0-based here.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (``else:`` elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Scrapes the numeric users_id from the profile page, then pages through
    the mobile AJAX episode list and queues each video URL.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``break``/``return`` lines and local initialisations (including the
    ``_PAGE_SIZE`` class attribute referenced below).  The code is kept
    byte-identical to the visible lines; ``(... elided ...)`` comments
    mark the gaps.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # AJAX endpoint template; %s is the numeric users_id scraped below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # (initialisation of ``video_ids``/``pagenum`` and the
        #  ``while True:`` loop opener elided in this view)
        self.report_download_page(username, pagenum)

        request = urllib2.Request( page_base + "&page=" + str(pagenum) )
        # (``try:`` opener elided in this view)
        page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        # NOTE(review): this handler uses str(err) where siblings use
        # compat_str(err) -- inconsistent, but left byte-identical here.
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
        # (``return`` elided in this view)

        # Extract video identifiers
        # (``ids_in_page = []`` elided in this view)
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # (``break`` and ``pagenum += 1`` elided in this view)

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based on the command line; 0-based here.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (``else:`` elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button and extracts the real
    fileshare URL and the file title from the resulting page.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``else:``/``return`` lines.  The code below is kept byte-identical to
    the visible lines; ``(... elided ...)`` comments mark the gaps.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button press).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # (``try:`` opener elided in this view)
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
        # (``return`` elided in this view)

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # (``else:`` elided) generic failure message
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            # (``return`` elided in this view)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (``if mobj is None:`` guard and ``return`` elided in this view)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # (``return [{`` elided in this view) info dictionary:
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        # ('uploader' entry elided in this view)
        'upload_date': u'NA',
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Performs an optional login (credentials from --username/--password or
    .netrc) in _real_initialize before extraction.

    NOTE(review): this excerpt elides some original source lines (visible
    as jumps in the file's own numbering): guards, ``try:`` openers,
    ``else:``/``return`` lines and local initialisations.  The code below
    is kept byte-identical to the visible lines; ``(... elided ...)``
    comments mark the gaps.  The final ``_real_extract`` continues past
    this excerpt.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers probed by _parse_page, best quality presumably
    # first -- TODO confirm ordering semantics against the format selector.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    # (dictionary entries and closing brace elided in this view)
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: regexes keyed by the info field they populate.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        # (closing brace and ``video_info = {}`` elided in this view)
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per available format.
        # (``video_urls = {}`` elided in this view)
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # (``return video_info`` elided in this view)

    def _real_initialize(self):
        if self._downloader is None:
            # (``return`` elided in this view)

        # (initialisation of ``useremail``/``password`` elided in this view)
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (``try:`` opener elided in this view)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (unpacking of login/password from ``info`` elided in this view)
            # (``else:`` elided) missing machine entry is a parse error
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
            # (``return`` elided in this view)

        if useremail is None:
            # (``return`` elided in this view)

        # Log in
        # (construction of ``login_form`` and ``try:`` opener elided in
        #  this view)
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # A login form in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # (``return`` elided in this view)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
        # (``return`` elided in this view)
1994 def _real_extract(self, url):
1995 mobj = re.match(self._VALID_URL, url)
1997 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1999 video_id = mobj.group('ID')
2002 self.report_video_webpage_download(video_id)
2003 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2005 page = urllib2.urlopen(request)
2006 video_webpage = page.read()
2007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2011 # Start extracting information
2012 self.report_information_extraction(video_id)
2014 # Extract information
2015 video_info = self._parse_page(video_webpage)
2018 if 'owner' not in video_info:
2019 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2021 video_uploader = video_info['owner']
2024 if 'title' not in video_info:
2025 self._downloader.trouble(u'ERROR: unable to extract video title')
2027 video_title = video_info['title']
2028 video_title = video_title.decode('utf-8')
2031 if 'thumbnail' not in video_info:
2032 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2033 video_thumbnail = ''
2035 video_thumbnail = video_info['thumbnail']
2039 if 'upload_date' in video_info:
2040 upload_time = video_info['upload_date']
2041 timetuple = email.utils.parsedate_tz(upload_time)
2042 if timetuple is not None:
2044 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2049 video_description = video_info.get('description', 'No description available.')
2051 url_map = video_info['video_urls']
2052 if len(url_map.keys()) > 0:
2053 # Decide which formats to download
2054 req_format = self._downloader.params.get('format', None)
2055 format_limit = self._downloader.params.get('format_limit', None)
2057 if format_limit is not None and format_limit in self._available_formats:
2058 format_list = self._available_formats[self._available_formats.index(format_limit):]
2060 format_list = self._available_formats
2061 existing_formats = [x for x in format_list if x in url_map]
2062 if len(existing_formats) == 0:
2063 self._downloader.trouble(u'ERROR: no known formats available for video')
2065 if req_format is None:
2066 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2067 elif req_format == 'worst':
2068 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2069 elif req_format == '-1':
2070 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2073 if req_format not in url_map:
2074 self._downloader.trouble(u'ERROR: requested format not available')
2076 video_url_list = [(req_format, url_map[req_format])] # Specific format
2079 for format_param, video_real_url in video_url_list:
2081 video_extension = self._video_extensions.get(format_param, 'mp4')
2084 'id': video_id.decode('utf-8'),
2085 'url': video_real_url.decode('utf-8'),
2086 'uploader': video_uploader.decode('utf-8'),
2087 'upload_date': upload_date,
2088 'title': video_title,
2089 'ext': video_extension.decode('utf-8'),
2090 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2091 'thumbnail': video_thumbnail.decode('utf-8'),
2092 'description': video_description.decode('utf-8'),
# blip.tv extractor. NOTE(review): listing is elided (embedded line numbers
# jump); `try:` headers, `return`s and parts of the info dicts are missing
# from view.
2096 class BlipTVIE(InfoExtractor):
2097 """Information extractor for blip.tv"""
2099 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the file extension off the end of the media URL.
2100 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2101 IE_NAME = u'blip.tv'
2103 def report_extraction(self, file_id):
2104 """Report information extraction."""
2105 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2107 def report_direct_download(self, title):
2108 """Report information extraction."""
2109 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
# Asks blip.tv for a JSON rendition of the page; if the server instead
# replies with a video/* Content-Type the URL is a direct media link.
2111 def _real_extract(self, url):
2112 mobj = re.match(self._VALID_URL, url)
2114 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# `cchar` ('?' or '&', chosen in elided lines) joins the JSON query string.
2121 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2122 request = urllib2.Request(json_url.encode('utf-8'))
2123 self.report_extraction(mobj.group(1))
2126 urlh = urllib2.urlopen(request)
2127 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# Derive title/ext from the last path component of the original URL.
2128 basename = url.split('/')[-1]
2129 title,ext = os.path.splitext(basename)
2130 title = title.decode('UTF-8')
2131 ext = ext.replace('.', '')
2132 self.report_direct_download(title)
2137 'upload_date': u'NA',
2142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2143 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Not a direct download: parse the JSON body instead.
2145 if info is None: # Regular URL
2147 json_code = urlh.read()
2148 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2149 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2153 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' object.
2154 if 'Post' in json_data:
2155 data = json_data['Post']
# blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalized to YYYYMMDD.
2159 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2160 video_url = data['media']['url']
2161 umobj = re.match(self._URL_EXT, video_url)
2163 raise ValueError('Can not determine filename extension')
2164 ext = umobj.group(1)
2167 'id': data['item_id'],
2169 'uploader': data['display_name'],
2170 'upload_date': upload_date,
2171 'title': data['title'],
2173 'format': data['media']['mimeType'],
2174 'thumbnail': data['thumbnailUrl'],
2175 'description': data['description'],
2176 'player_url': data['embedUrl']
2178 except (ValueError,KeyError), err:
2179 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Global User-Agent override -- presumably blip.tv serves different (usable)
# media to the iTunes client; verify before changing. Mutates shared
# std_headers for the whole process.
2182 std_headers['User-Agent'] = 'iTunes/10.6.1'
2186 class MyVideoIE(InfoExtractor):
2187 """Information Extractor for myvideo.de."""
2189 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2190 IE_NAME = u'myvideo'
2192 def __init__(self, downloader=None):
2193 InfoExtractor.__init__(self, downloader)
2195 def report_download_webpage(self, video_id):
2196 """Report webpage download."""
2197 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2199 def report_extraction(self, video_id):
2200 """Report information extraction."""
2201 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2203 def _real_extract(self,url):
2204 mobj = re.match(self._VALID_URL, url)
2206 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2209 video_id = mobj.group(1)
2212 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2214 self.report_download_webpage(video_id)
2215 webpage = urllib2.urlopen(request).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2220 self.report_extraction(video_id)
2221 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2224 self._downloader.trouble(u'ERROR: unable to extract media URL')
2226 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2228 mobj = re.search('<title>([^<]+)</title>', webpage)
2230 self._downloader.trouble(u'ERROR: unable to extract title')
2233 video_title = mobj.group(1)
2239 'upload_date': u'NA',
2240 'title': video_title,
# Daily Show / Colbert Report extractor. NOTE(review): listing is elided
# (embedded line numbers jump); guards, `try:` headers, `else:` branches and
# the result-dict head are missing from view.
2244 class ComedyCentralIE(InfoExtractor):
2245 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ':shortname' alias (tds/cr/...) or a full-episodes URL;
# 'showname' and 'episode' are named groups used below.
2247 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2248 IE_NAME = u'comedycentral'
# Bitrates, ascending; turls[-1] below relies on this ordering.
2250 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Bitrate -> extension / display dimensions; bodies elided in this listing.
2252 _video_extensions = {
2260 _video_dimensions = {
2269 def report_extraction(self, episode_id):
2270 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2272 def report_config_download(self, episode_id):
2273 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2275 def report_index_download(self, episode_id):
2276 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2278 def report_player_url(self, episode_id):
2279 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Prints one line per known bitrate (the `for x in ...` loop header is
# among the elided lines).
2282 def _print_formats(self, formats):
2283 print('Available formats:')
2285 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2288 def _real_extract(self, url):
2289 mobj = re.match(self._VALID_URL, url)
2291 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname aliases are rewritten to the show's full-episodes index URL
# and re-matched so the named groups are populated.
2294 if mobj.group('shortname'):
2295 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2296 url = u'http://www.thedailyshow.com/full-episodes/'
2298 url = u'http://www.colbertnation.com/full-episodes/'
2299 mobj = re.match(self._VALID_URL, url)
2300 assert mobj is not None
# No episode component means "download the newest episode".
2302 dlNewest = not mobj.group('episode')
2304 epTitle = mobj.group('showname')
2306 epTitle = mobj.group('episode')
2308 req = urllib2.Request(url)
2309 self.report_extraction(epTitle)
2311 htmlHandle = urllib2.urlopen(req)
2312 html = htmlHandle.read()
2313 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2314 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The index page redirects to a concrete episode; re-validate the final URL.
2317 url = htmlHandle.geturl()
2318 mobj = re.match(self._VALID_URL, url)
2320 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2322 if mobj.group('episode') == '':
2323 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2325 epTitle = mobj.group('episode')
# Locate the Flash player reference (mtvnservices mgid URI) in the page.
2327 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2329 if len(mMovieParams) == 0:
2330 # The Colbert Report embeds the information in a without
2331 # a URL prefix; so extract the alternate reference
2332 # and then add the URL prefix manually.
2334 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2335 if len(altMovieParams) == 0:
2336 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2339 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL through its redirect chain.
2341 playerUrl_raw = mMovieParams[0][0]
2342 self.report_player_url(epTitle)
2344 urlHandle = urllib2.urlopen(playerUrl_raw)
2345 playerUrl = urlHandle.geturl()
2346 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2347 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS show index keyed by the mgid URI.
2350 uri = mMovieParams[0][1]
2351 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2352 self.report_index_download(epTitle)
2354 indexXml = urllib2.urlopen(indexUrl).read()
2355 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2356 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment of the episode.
2361 idoc = xml.etree.ElementTree.fromstring(indexXml)
2362 itemEls = idoc.findall('.//item')
2363 for itemEl in itemEls:
2364 mediaId = itemEl.findall('./guid')[0].text
2365 shortMediaId = mediaId.split(':')[-1]
2366 showId = mediaId.split(':')[-2].replace('.com', '')
2367 officialTitle = itemEl.findall('./title')[0].text
2368 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate + src).
2370 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2371 urllib.urlencode({'uri': mediaId}))
2372 configReq = urllib2.Request(configUrl)
2373 self.report_config_download(epTitle)
2375 configXml = urllib2.urlopen(configReq).read()
2376 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2377 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2380 cdoc = xml.etree.ElementTree.fromstring(configXml)
2382 for rendition in cdoc.findall('.//rendition'):
2383 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2387 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2390 if self._downloader.params.get('listformats', None):
2391 self._print_formats([i[0] for i in turls])
2394 # For now, just pick the highest bitrate
2395 format,video_url = turls[-1]
2397 # Get the format arg from the arg stream
2398 req_format = self._downloader.params.get('format', None)
2400 # Select format if we can find one
# `f, v` come from an elided loop over turls matching req_format.
2403 format, video_url = f, v
2406 # Patch to download from alternative CDN, which does not
2407 # break on current RTMPDump builds
2408 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2409 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2411 if video_url.startswith(broken_cdn):
2412 video_url = video_url.replace(broken_cdn, better_cdn)
2414 effTitle = showId + u'-' + epTitle
# Result dict head elided; only some fields are visible here.
2419 'upload_date': officialDate,
2424 'description': officialTitle,
2425 'player_url': None #playerUrl
2428 results.append(info)
# Escapist Magazine extractor. NOTE(review): listing is elided (embedded
# line numbers jump); guards, `try:` headers and the result-dict head are
# missing from view.
2433 class EscapistIE(InfoExtractor):
2434 """Information extractor for The Escapist """
2436 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2437 IE_NAME = u'escapist'
2439 def report_extraction(self, showName):
2440 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2442 def report_config_download(self, showName):
2443 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Pulls OpenGraph meta tags from the page, follows the player's `config=`
# query parameter to a JS config blob, and reads the media URL from it.
2445 def _real_extract(self, url):
2446 mobj = re.match(self._VALID_URL, url)
2448 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2450 showName = mobj.group('showname')
2451 videoId = mobj.group('episode')
2453 self.report_extraction(showName)
2455 webPage = urllib2.urlopen(url)
2456 webPageBytes = webPage.read()
# Decode using the charset advertised in the Content-Type header,
# defaulting to utf-8 when none is present.
2457 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2458 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# No None checks on these matches -- a missing meta tag would raise
# AttributeError here (behavior of the original code, left as-is).
2463 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2464 description = unescapeHTML(descMatch.group(1))
2465 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2466 imgUrl = unescapeHTML(imgMatch.group(1))
2467 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2468 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2469 configUrlMatch = re.search('config=(.*)$', playerUrl)
2470 configUrl = urllib2.unquote(configUrlMatch.group(1))
2472 self.report_config_download(showName)
2474 configJSON = urllib2.urlopen(configUrl).read()
2475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2479 # Technically, it's JavaScript, not JSON
# Naive quote swap to make the JS object parseable as JSON; breaks if the
# payload contains apostrophes inside strings.
2480 configJSON = configJSON.replace("'", '"')
2483 config = json.loads(configJSON)
2484 except (ValueError,), err:
2485 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL is taken from the second playlist entry (index 1).
2488 playlist = config['playlist']
2489 videoUrl = playlist[1]['url']
2494 'uploader': showName,
2495 'upload_date': u'NA',
2498 'thumbnail': imgUrl,
2499 'description': description,
2500 'player_url': playerUrl,
# collegehumor.com extractor. NOTE(review): listing is elided (embedded line
# numbers jump); guards, `try:` headers and the info-dict head are missing.
2506 class CollegeHumorIE(InfoExtractor):
2507 """Information extractor for collegehumor.com"""
2509 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2510 IE_NAME = u'collegehumor'
2512 def report_webpage(self, video_id):
2513 """Report information extraction."""
2514 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2516 def report_extraction(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrapes the page for an internal id, then reads title/url/thumbnail from
# the 'moogaloop' metadata XML endpoint.
2520 def _real_extract(self, url):
2521 mobj = re.match(self._VALID_URL, url)
2523 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2525 video_id = mobj.group('videoid')
2527 self.report_webpage(video_id)
2528 request = urllib2.Request(url)
2530 webpage = urllib2.urlopen(request).read()
2531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page embeds its internal numeric id as id="video:NNN".
2535 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2537 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2539 internal_video_id = m.group('internalvideoid')
2543 'internal_id': internal_video_id,
2545 'upload_date': u'NA',
2548 self.report_extraction(video_id)
2549 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2551 metaXml = urllib2.urlopen(xmlUrl).read()
2552 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2556 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError from missing nodes is handled by an elided except clause
# reporting 'Invalid metadata XML file' below.
2558 videoNode = mdoc.findall('./video')[0]
2559 info['description'] = videoNode.findall('./description')[0].text
2560 info['title'] = videoNode.findall('./caption')[0].text
2561 info['url'] = videoNode.findall('./file')[0].text
2562 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' in the media URL.
2563 info['ext'] = info['url'].rpartition('.')[2]
2565 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# xvideos.com extractor. NOTE(review): listing is elided (embedded line
# numbers jump); guards, `try:` headers and the info-dict head are missing.
2571 class XVideosIE(InfoExtractor):
2572 """Information extractor for xvideos.com"""
2574 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2575 IE_NAME = u'xvideos'
2577 def report_webpage(self, video_id):
2578 """Report information extraction."""
2579 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2581 def report_extraction(self, video_id):
2582 """Report information extraction."""
2583 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrapes the flv URL, title and thumbnail straight out of the watch page.
2585 def _real_extract(self, url):
2586 mobj = re.match(self._VALID_URL, url)
2588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2590 video_id = mobj.group(1).decode('utf-8')
2592 self.report_webpage(video_id)
2594 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2596 webpage = urllib2.urlopen(request).read()
2597 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2598 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2601 self.report_extraction(video_id)
# The media URL is a percent-encoded 'flv_url' parameter in the page.
2605 mobj = re.search(r'flv_url=(.+?)&', webpage)
2607 self._downloader.trouble(u'ERROR: unable to extract video url')
2609 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title comes from the <title> tag, minus the trailing '- XVID...' suffix.
2613 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2615 self._downloader.trouble(u'ERROR: unable to extract video title')
2617 video_title = mobj.group(1).decode('utf-8')
2620 # Extract video thumbnail
2621 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2623 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL, not the capture group.
2625 video_thumbnail = mobj.group(0).decode('utf-8')
2631 'upload_date': u'NA',
2632 'title': video_title,
2634 'thumbnail': video_thumbnail,
2635 'description': None,
# soundcloud.com extractor. NOTE(review): listing is elided (embedded line
# numbers jump); guards, `else:` branches and the result-dict head/tail are
# missing from view.
2641 class SoundcloudIE(InfoExtractor):
2642 """Information extractor for soundcloud.com
2643 To access the media, the uid of the song and a stream token
2644 must be extracted from the page source and the script must make
2645 a request to media.soundcloud.com/crossdomain.xml. Then
2646 the media can be grabbed by requesting from an url composed
2647 of the stream token and uid
# Group 1 = uploader slug, group 2 = track slug.
2650 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2651 IE_NAME = u'soundcloud'
2653 def __init__(self, downloader=None):
2654 InfoExtractor.__init__(self, downloader)
2656 def report_webpage(self, video_id):
2657 """Report information extraction."""
2658 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2660 def report_extraction(self, video_id):
2661 """Report information extraction."""
2662 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2664 def _real_extract(self, url):
2665 mobj = re.match(self._VALID_URL, url)
2667 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2670 # extract uploader (which is in the url)
2671 uploader = mobj.group(1).decode('utf-8')
2672 # extract simple title (uploader + slug of song title)
2673 slug_title = mobj.group(2).decode('utf-8')
2674 simple_title = uploader + u'-' + slug_title
2676 self.report_webpage('%s/%s' % (uploader, slug_title))
2678 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2680 webpage = urllib2.urlopen(request).read()
2681 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2682 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2685 self.report_extraction('%s/%s' % (uploader, slug_title))
2687 # extract uid and stream token that soundcloud hands out for access
2688 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2690 video_id = mobj.group(1)
2691 stream_token = mobj.group(2)
2693 # extract unsimplified title
2694 mobj = re.search('"title":"(.*?)",', webpage)
2696 title = mobj.group(1).decode('utf-8')
# Fallback when the real title is not found (the `else:` is elided).
2698 title = simple_title
2700 # construct media url (with uid/token)
2701 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2702 mediaURL = mediaURL % (video_id, stream_token)
2705 description = u'No description available'
2706 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2708 description = mobj.group(1)
# Upload date scraped from the page, e.g. 'November 5, 2011 12:30';
# normalized to YYYYMMDD; parse failures are logged, not fatal.
2712 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2715 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2716 except Exception, e:
2717 self._downloader.to_stderr(compat_str(e))
2719 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): the Request is constructed here but the corresponding
# urlopen call is not visible in this elided listing -- confirm it exists.
2720 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2723 'id': video_id.decode('utf-8'),
2725 'uploader': uploader.decode('utf-8'),
2726 'upload_date': upload_date,
2729 'description': description.decode('utf-8')
# infoq.com extractor. NOTE(review): listing is elided (embedded line
# numbers jump); guards, `try:` headers and the result-dict head are missing.
# The IE_NAME class attribute is among the elided lines.
2733 class InfoQIE(InfoExtractor):
2734 """Information extractor for infoq.com"""
2736 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2739 def report_webpage(self, video_id):
2740 """Report information extraction."""
2741 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2743 def report_extraction(self, video_id):
2744 """Report information extraction."""
2745 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2747 def _real_extract(self, url):
2748 mobj = re.match(self._VALID_URL, url)
2750 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2753 self.report_webpage(url)
2755 request = urllib2.Request(url)
2757 webpage = urllib2.urlopen(request).read()
2758 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2759 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2762 self.report_extraction(url)
# The page carries a base64-encoded, percent-escaped RTMP path in the
# jsclassref attribute; decoded and appended to the rtmpe base URL.
2766 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2768 self._downloader.trouble(u'ERROR: unable to extract video url')
2770 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2774 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2776 self._downloader.trouble(u'ERROR: unable to extract video title')
2778 video_title = mobj.group(1).decode('utf-8')
2780 # Extract description
2781 video_description = u'No description available.'
2782 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2783 if mobj is not None:
2784 video_description = mobj.group(1).decode('utf-8')
# The video id and extension are taken from the media URL's filename;
# assumes exactly one '.' in the basename (split('.') would otherwise
# raise a ValueError on unpacking).
2786 video_filename = video_url.split('/')[-1]
2787 video_id, extension = video_filename.split('.')
2793 'upload_date': u'NA',
2794 'title': video_title,
2795 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2797 'description': video_description,
# mixcloud.com extractor, driven by the site's public JSON API.
# NOTE(review): listing is elided (embedded line numbers jump); `try:`
# headers, `return`s and some guards are missing from view.
2802 class MixcloudIE(InfoExtractor):
2803 """Information extractor for www.mixcloud.com"""
# Group 1 = uploader slug, group 2 = cloudcast slug.
2804 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2805 IE_NAME = u'mixcloud'
2807 def __init__(self, downloader=None):
2808 InfoExtractor.__init__(self, downloader)
2810 def report_download_json(self, file_id):
2811 """Report JSON download."""
2812 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2814 def report_extraction(self, file_id):
2815 """Report information extraction."""
2816 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for one format, optionally narrowed to a bitrate;
# 'best' (or an unknown bitrate) selects the highest available one.
2818 def get_urls(self, jsonData, fmt, bitrate='best'):
2819 """Get urls from 'audio_formats' section in json"""
2822 bitrate_list = jsonData[fmt]
2823 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2824 bitrate = max(bitrate_list) # select highest
2826 url_list = jsonData[fmt][bitrate]
# Some entries map fmt directly to a URL list with no bitrate level.
2827 except TypeError: # we have no bitrate info.
2828 url_list = jsonData[fmt]
2831 def check_urls(self, url_list):
2832 """Returns 1st active url from list"""
# Probes each candidate with a request; network errors advance to the next.
2833 for url in url_list:
2835 urllib2.urlopen(url)
2837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Lists 'format  bitrate  [ext]' lines; mirrors the TypeError handling in
# get_urls for formats without bitrate levels.
2842 def _print_formats(self, formats):
2843 print('Available formats:')
2844 for fmt in formats.keys():
2845 for b in formats[fmt]:
2847 ext = formats[fmt][b][0]
2848 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2849 except TypeError: # we have no bitrate info
2850 ext = formats[fmt][0]
2851 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2854 def _real_extract(self, url):
2855 mobj = re.match(self._VALID_URL, url)
2857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2859 # extract uploader & filename from url
2860 uploader = mobj.group(1).decode('utf-8')
2861 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2863 # construct API request
# Rebuilds '<uploader>/<cloudcast>' from the last URL components.
2864 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2865 # retrieve .json file with links to files
2866 request = urllib2.Request(file_url)
2868 self.report_download_json(file_url)
2869 jsonData = urllib2.urlopen(request).read()
2870 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2871 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2875 json_data = json.loads(jsonData)
2876 player_url = json_data['player_swf_url']
2877 formats = dict(json_data['audio_formats'])
2879 req_format = self._downloader.params.get('format', None)
2882 if self._downloader.params.get('listformats', None):
2883 self._print_formats(formats)
# No explicit format (or 'best'): take the first format whose URL list
# yields a live URL.
2886 if req_format is None or req_format == 'best':
2887 for format_param in formats.keys():
2888 url_list = self.get_urls(formats, format_param)
2890 file_url = self.check_urls(url_list)
2891 if file_url is not None:
2894 if req_format not in formats.keys():
2895 self._downloader.trouble(u'ERROR: format is not available')
2898 url_list = self.get_urls(formats, req_format)
2899 file_url = self.check_urls(url_list)
2900 format_param = req_format
# Result dict (head elided); extension is the URL's last '.'-component.
2903 'id': file_id.decode('utf-8'),
2904 'url': file_url.decode('utf-8'),
2905 'uploader': uploader.decode('utf-8'),
2906 'upload_date': u'NA',
2907 'title': json_data['name'],
2908 'ext': file_url.split('.')[-1].decode('utf-8'),
2909 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2910 'thumbnail': json_data['thumbnail_url'],
2911 'description': json_data['description'],
2912 'player_url': player_url.decode('utf-8'),
# Stanford Open ClassRoom extractor. Handles three URL shapes: a specific
# video (course+video), a course page (course only), and the root page.
# Course/root pages return playlists of 'reference' entries that are then
# extracted recursively via self.extract(). NOTE(review): listing is elided
# (embedded line numbers jump); `try:` headers, dict heads and `return`s
# are missing from view.
2915 class StanfordOpenClassroomIE(InfoExtractor):
2916 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: 'course' and 'video' from the query string; both optional.
2918 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2919 IE_NAME = u'stanfordoc'
2921 def report_download_webpage(self, objid):
2922 """Report information extraction."""
2923 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2925 def report_extraction(self, video_id):
2926 """Report information extraction."""
2927 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a single video; metadata comes from a per-video XML file. ---
2935 if mobj.group('course') and mobj.group('video'): # A specific video
2936 course = mobj.group('course')
2937 video = mobj.group('video')
2939 'id': course + '_' + video,
2941 'upload_date': u'NA',
2944 self.report_extraction(info['id'])
2945 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2946 xmlUrl = baseUrl + video + '.xml'
2948 metaXml = urllib2.urlopen(xmlUrl).read()
2949 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2950 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2952 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError from missing nodes is reported by the elided except clause.
2954 info['title'] = mdoc.findall('./title')[0].text
2955 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2957 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2959 info['ext'] = info['url'].rpartition('.')[2]
# --- Case 2: a course page; collect its VideoPage links as references. ---
2961 elif mobj.group('course'): # A course page
2962 course = mobj.group('course')
2967 'upload_date': u'NA',
2970 self.report_download_webpage(info['id'])
2972 coursepage = urllib2.urlopen(url).read()
2973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2974 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2977 m = re.search('<h1>([^<]+)</h1>', coursepage)
2979 info['title'] = unescapeHTML(m.group(1))
# Fallback title when no <h1> is present (the `else:` is elided).
2981 info['title'] = info['id']
2983 m = re.search('<description>([^<]+)</description>', coursepage)
2985 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps each VideoPage link once, in first-seen order.
2987 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2990 'type': 'reference',
2991 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract every referenced video page.
2995 for entry in info['list']:
2996 assert entry['type'] == 'reference'
2997 results += self.extract(entry['url'])
# --- Case 3: the root page; collect CoursePage links as references. ---
3002 'id': 'Stanford OpenClassroom',
3005 'upload_date': u'NA',
3008 self.report_download_webpage(info['id'])
3009 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3011 rootpage = urllib2.urlopen(rootURL).read()
3012 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3013 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3016 info['title'] = info['id']
3018 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3021 'type': 'reference',
3022 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3027 for entry in info['list']:
3028 assert entry['type'] == 'reference'
3029 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # NOTE(review): IE_NAME line was dropped from the garbled source;
    # restored because report_webpage/report_extraction read self.IE_NAME.
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video URL and metadata for an MTV.com video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # The regex accepts scheme-less URLs; normalize before fetching.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed: original message read "unable to mtvn_uri" (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        try:
            metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': u'NA',
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segment-based FLV downloads)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-unique session id: epoch millis + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character table shuffled by Youku's seeded PRNG.

        The server-provided seed drives a linear-congruential scramble of
        the alphabet; the result is the decoding table for file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract every segment of a Youku video as separate info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # Youku can only be viewed from mainland China; a blocked or
            # malformed response leaves the expected fields missing.
            # Narrowed from a bare `except:` so real bugs still surface.
        except (KeyError, IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8-9 of fileid encode the segment number:
        # fileid[7:9] must be patched per segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # NOTE(review): IE_NAME line was dropped from the garbled source;
    # restored because report_webpage/report_extraction read self.IE_NAME.
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # The flv_url parameter is percent-encoded in the page source.
        video_url = urllib.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': u'NA',
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }

        return [info]
3308 class GooglePlusIE(InfoExtractor):
3309 """Information extractor for plus.google.com."""
3311 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3312 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a FileDownloader."""
    InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Log which Google+ post entry is being downloaded."""
    entry = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % entry)
def report_date(self, upload_date):
    """Log the upload date extracted from the entry."""
    line = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(line)
def report_uploader(self, uploader):
    """Log the uploader name extracted from the entry."""
    who = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % who)
def report_title(self, video_title):
    """Log the title extracted from the entry."""
    title = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title)
def report_extract_vid_page(self, video_page):
    """Log which secondary video page is being parsed."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3337 def _real_extract(self, url):
3338 # Extract id from URL
3339 mobj = re.match(self._VALID_URL, url)
3341 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3344 post_url = mobj.group(0)
3345 video_id = mobj.group(2)
3347 video_extension = 'flv'
3349 # Step 1, Retrieve post webpage to extract further information
3350 self.report_extract_entry(post_url)
3351 request = urllib2.Request(post_url)
3353 webpage = urllib2.urlopen(request).read()
3354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3358 # Extract update date
3360 pattern = 'title="Timestamp">(.*?)</a>'
3361 mobj = re.search(pattern, webpage)
3363 upload_date = mobj.group(1)
3364 # Convert timestring to a format suitable for filename
3365 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3366 upload_date = upload_date.strftime('%Y%m%d')
3367 self.report_date(upload_date)
3371 pattern = r'rel\="author".*?>(.*?)</a>'
3372 mobj = re.search(pattern, webpage)
3374 uploader = mobj.group(1)
3375 self.report_uploader(uploader)
3378 # Get the first line for title
3380 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3381 mobj = re.search(pattern, webpage)
3383 video_title = mobj.group(1)
3384 self.report_title(video_title)
3386 # Step 2, Stimulate clicking the image box to launch video
3387 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3388 mobj = re.search(pattern, webpage)
3390 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3392 video_page = mobj.group(1)
3393 request = urllib2.Request(video_page)
3395 webpage = urllib2.urlopen(request).read()
3396 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3397 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3399 self.report_extract_vid_page(video_page)
3402 # Extract video links on video page
3403 """Extract video links of all sizes"""
3404 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3405 mobj = re.findall(pattern, webpage)
3407 self._downloader.trouble(u'ERROR: unable to extract video links')
3409 # Sort in resolution
3410 links = sorted(mobj)
3412 # Choose the lowest of the sort, i.e. highest resolution
3413 video_url = links[-1]
3414 # Only get the url. The resolution part in the tuple has no use anymore
3415 video_url = video_url[-1]
3416 # Treat escaped \u0026 style hex
3417 video_url = unicode(video_url, "unicode_escape")
3421 'id': video_id.decode('utf-8'),
3423 'uploader': uploader.decode('utf-8'),
3424 'upload_date': upload_date.decode('utf-8'),
3425 'title': video_title.decode('utf-8'),
3426 'ext': video_extension.decode('utf-8'),