2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
28 class InfoExtractor(object):
# NOTE(review): this listing is a sampled excerpt -- the leading numbers are the
# original file's line numbers and they jump, so statements between many of the
# lines below are missing. Code also uses Python 2 idioms throughout
# (urllib2, urlparse.parse_qs, `except E, err` syntax).
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
43 uploader: Nickname of the video uploader.
44 title: Video title, unescaped.
45 ext: Video filename extension.
47 The following fields are optional:
49 format: The video format, defaults to ext (used for --get-format)
50 thumbnail: Full URL to a video thumbnail image.
51 description One-line video description.
52 player_url: SWF Player URL (used for rtmpdump).
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
58 _real_extract() must return a *list* of information dictionaries as
65 def __init__(self, downloader=None):
66 """Constructor. Receives an optional downloader."""
68 self.set_downloader(downloader)
70 def suitable(self, url):
71 """Receives a URL and returns True if suitable for this IE."""
72 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def initialize(self):` line (original line ~74) is not in
# this excerpt; the docstring and delegation to _real_initialize() below belong
# to that public wrapper.
75 """Initializes an instance (authentication, etc)."""
77 self._real_initialize()
80 def extract(self, url):
81 """Extracts URL information and returns it in list of dicts."""
83 return self._real_extract(url)
85 def set_downloader(self, downloader):
86 """Sets the downloader for this IE."""
87 self._downloader = downloader
# The two hooks below are intentionally no-op templates for subclasses.
89 def _real_initialize(self):
90 """Real initialization process. Redefine in subclasses."""
93 def _real_extract(self, url):
94 """Real extraction process. Redefine in subclasses."""
98 class YoutubeIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump, so `try:` lines,
# `return` statements and dict/pattern openings are missing between many of the
# lines below. Python 2 only (urllib2/urllib, `except E, err`).
99 """Information extractor for youtube.com."""
# The lines down to "(?(1).+)?" are the body of the verbose _VALID_URL regex;
# its opening `_VALID_URL = r'''^(` line (original ~100-102) is not shown here.
103 (?:https?://)? # http(s):// (optional)
104 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
105 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
106 (?:.*?\#/)? # handle anchor (#/) redirect urls
107 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
108 (?: # the various things that can precede the ID:
109 (?:(?:v|embed|e)/) # v/ or embed/ or e/
110 |(?: # or the v= param in all its forms
111 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
112 (?:\?|\#!?) # the params delimiter ? or # or #!
113 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
116 )? # optional -> youtube.com/xxxx is OK
117 )? # all until now is optional -> you can pass the naked ID
118 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
119 (?(1).+)? # if we found the ID, everything can follow
121 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
122 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
123 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
124 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
125 _NETRC_MACHINE = 'youtube'
126 # Listed in order of quality
127 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
128 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; interior entries (original 130-140) omitted
# from this excerpt.
129 _video_extensions = {
135 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" dimension strings; entries omitted from this excerpt.
141 _video_dimensions = {
# Overrides the base class: _VALID_URL above is a verbose pattern, so the
# match must be done with re.VERBOSE.
159 def suitable(self, url):
160 """Receives a URL and returns True if suitable for this IE."""
161 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
163 def report_lang(self):
164 """Report attempt to set language."""
165 self._downloader.to_screen(u'[youtube] Setting language')
167 def report_login(self):
168 """Report attempt to log in."""
169 self._downloader.to_screen(u'[youtube] Logging in')
171 def report_age_confirmation(self):
172 """Report attempt to confirm age."""
173 self._downloader.to_screen(u'[youtube] Confirming age')
175 def report_video_webpage_download(self, video_id):
176 """Report attempt to download video webpage."""
177 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
179 def report_video_info_webpage_download(self, video_id):
180 """Report attempt to download video info webpage."""
181 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
183 def report_video_subtitles_download(self, video_id):
184 """Report attempt to download video info webpage."""
185 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
187 def report_information_extraction(self, video_id):
188 """Report attempt to extract video information."""
189 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
191 def report_unavailable_format(self, video_id, format):
192 """Report extracted video URL."""
193 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
195 def report_rtmp_download(self):
196 """Indicate the download will use the RTMP protocol."""
197 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML into SRT text.
# NOTE(review): the `srt` accumulator initialization (original line 200) and
# the conversion of `start` to float (presumably on the missing line 205) are
# not in this excerpt -- as shown, `start + float(dur)` would need `start`
# already numeric; confirm against the full file.
199 def _closed_captions_xml_to_srt(self, xml_string):
201 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
202 # TODO parse xml instead of regex
203 for n, (start, dur_tag, dur, caption) in enumerate(texts):
204 if not dur: dur = '4'
206 end = start + float(dur)
207 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
208 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
209 caption = unescapeHTML(caption)
210 caption = unescapeHTML(caption) # double cycle, intentional
211 srt += str(n+1) + '\n'
212 srt += start + ' --> ' + end + '\n'
213 srt += caption + '\n\n'
# Prints "itag : extension [dimensions]" for each available format;
# the surrounding `for x in formats:` line (original 218) is not shown.
216 def _print_formats(self, formats):
217 print('Available formats:')
219 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets language/login/age-gate cookies before extraction. Several `try:`
# lines and the netrc credential unpacking fall in omitted lines.
221 def _real_initialize(self):
222 if self._downloader is None:
227 downloader_params = self._downloader.params
229 # Attempt to use provided username and password or .netrc data
230 if downloader_params.get('username', None) is not None:
231 username = downloader_params['username']
232 password = downloader_params['password']
233 elif downloader_params.get('usenetrc', False):
235 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
240 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
241 except (IOError, netrc.NetrcParseError), err:
242 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Force the English interface so later regex scraping sees stable markup.
246 request = urllib2.Request(self._LANG_URL)
249 urllib2.urlopen(request).read()
250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
251 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
254 # No authentication to be performed
# login_form dict opening (original ~258-259) omitted from this excerpt.
260 'current_form': 'loginForm',
262 'action_login': 'Log In',
263 'username': username,
264 'password': password,
266 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
269 login_results = urllib2.urlopen(request).read()
# Login is judged failed if the login form is still present in the response.
270 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
271 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
274 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# age_form dict opening omitted from this excerpt.
280 'action_confirm': 'Confirm',
282 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
284 self.report_age_confirmation()
285 age_results = urllib2.urlopen(request).read()
286 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
287 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
290 def _real_extract(self, url):
291 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
292 mobj = re.search(self._NEXT_URL_RE, url)
294 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
296 # Extract video id from URL
297 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
301 video_id = mobj.group(2)
304 self.report_video_webpage_download(video_id)
305 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
307 video_webpage = urllib2.urlopen(request).read()
308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
312 # Attempt to extract SWF player URL
313 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Unescape the JS-escaped URL (\\/ -> /).
315 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` values of get_video_info until one yields a 'token'.
320 self.report_video_info_webpage_download(video_id)
321 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
322 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
323 % (video_id, el_type))
324 request = urllib2.Request(video_info_url)
326 video_info_webpage = urllib2.urlopen(request).read()
327 video_info = parse_qs(video_info_webpage)
328 if 'token' in video_info:
330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
331 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
333 if 'token' not in video_info:
334 if 'reason' in video_info:
335 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
337 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
340 # Check for "rental" videos
341 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
342 self._downloader.trouble(u'ERROR: "rental" videos not supported')
345 # Start extracting information
346 self.report_information_extraction(video_id)
349 if 'author' not in video_info:
350 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
352 video_uploader = urllib.unquote_plus(video_info['author'][0])
355 if 'title' not in video_info:
356 self._downloader.trouble(u'ERROR: unable to extract video title')
358 video_title = urllib.unquote_plus(video_info['title'][0])
359 video_title = video_title.decode('utf-8')
362 if 'thumbnail_url' not in video_info:
363 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
365 else: # don't panic if we can't find it
366 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD.
370 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
372 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
373 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
374 for expression in format_expressions:
376 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
381 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
382 if video_description: video_description = clean_html(video_description)
383 else: video_description = ''
# Closed captions: failures are raised as Trouble and downgraded to warnings
# in the except handler at original line 415.
386 video_subtitles = None
387 if self._downloader.params.get('writesubtitles', False):
389 self.report_video_subtitles_download(video_id)
390 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
392 srt_list = urllib2.urlopen(request).read()
393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
394 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
395 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
396 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
397 if not srt_lang_list:
398 raise Trouble(u'WARNING: video has no closed captions')
399 if self._downloader.params.get('subtitleslang', False):
400 srt_lang = self._downloader.params.get('subtitleslang')
401 elif 'en' in srt_lang_list:
# Fall back to an arbitrary available language (Python 2 .keys() list).
404 srt_lang = srt_lang_list.keys()[0]
405 if not srt_lang in srt_lang_list:
406 raise Trouble(u'WARNING: no closed captions found in the specified language')
407 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
409 srt_xml = urllib2.urlopen(request).read()
410 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
411 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
413 raise Trouble(u'WARNING: unable to download video subtitles')
414 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
415 except Trouble as trouble:
416 self._downloader.trouble(trouble[0])
418 if 'length_seconds' not in video_info:
419 self._downloader.trouble(u'WARNING: unable to extract video duration')
422 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
425 video_token = urllib.unquote_plus(video_info['token'][0])
427 # Decide which formats to download
428 req_format = self._downloader.params.get('format', None)
430 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
431 self.report_rtmp_download()
432 video_url_list = [(None, video_info['conn'][0])]
433 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
434 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
435 url_data = [parse_qs(uds) for uds in url_data_strs]
436 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
# NOTE(review): the filter above only requires 'itag' and 'url'; an entry
# without 'sig' would raise KeyError on the next line -- confirm 'sig' is
# always present, or the filter should require it too.
437 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
439 format_limit = self._downloader.params.get('format_limit', None)
440 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
441 if format_limit is not None and format_limit in available_formats:
442 format_list = available_formats[available_formats.index(format_limit):]
444 format_list = available_formats
445 existing_formats = [x for x in format_list if x in url_map]
446 if len(existing_formats) == 0:
447 self._downloader.trouble(u'ERROR: no known formats available for video')
449 if self._downloader.params.get('listformats', None):
450 self._print_formats(existing_formats)
452 if req_format is None or req_format == 'best':
453 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
454 elif req_format == 'worst':
455 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
456 elif req_format in ('-1', 'all'):
457 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
459 # Specific formats. We pick the first in a slash-delimeted sequence.
460 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
461 req_formats = req_format.split('/')
462 video_url_list = None
463 for rf in req_formats:
465 video_url_list = [(rf, url_map[rf])]
467 if video_url_list is None:
468 self._downloader.trouble(u'ERROR: requested format not available')
471 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format; format_param is None only on
# the RTMP path, in which case the extension stands in for the format name.
475 for format_param, video_real_url in video_url_list:
477 video_extension = self._video_extensions.get(format_param, 'flv')
479 video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
480 self._video_dimensions.get(format_param, '???'))
483 'id': video_id.decode('utf-8'),
484 'url': video_real_url.decode('utf-8'),
485 'uploader': video_uploader.decode('utf-8'),
486 'upload_date': upload_date,
487 'title': video_title,
488 'ext': video_extension.decode('utf-8'),
489 'format': video_format,
490 'thumbnail': video_thumbnail.decode('utf-8'),
491 'description': video_description,
492 'player_url': player_url,
493 'subtitles': video_subtitles,
494 'duration': video_duration
499 class MetacafeIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump; `try:` lines,
# `return`s and dict openings fall in omitted lines. Python 2 only.
500 """Information Extractor for metacafe.com."""
502 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
503 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
504 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
505 IE_NAME = u'metacafe'
507 def __init__(self, downloader=None):
508 InfoExtractor.__init__(self, downloader)
510 def report_disclaimer(self):
511 """Report disclaimer retrieval."""
512 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
514 def report_age_confirmation(self):
515 """Report attempt to confirm age."""
516 self._downloader.to_screen(u'[metacafe] Confirming age')
518 def report_download_webpage(self, video_id):
519 """Report webpage download."""
520 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
522 def report_extraction(self, video_id):
523 """Report information extraction."""
524 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer and posts the over-18 confirmation so
# later page fetches are not filtered.
526 def _real_initialize(self):
527 # Retrieve disclaimer
528 request = urllib2.Request(self._DISCLAIMER)
530 self.report_disclaimer()
531 disclaimer = urllib2.urlopen(request).read()
532 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
533 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# disclaimer_form dict opening (original ~536-538) omitted from this excerpt.
539 'submit': "Continue - I'm over 18",
541 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
543 self.report_age_confirmation()
544 disclaimer = urllib2.urlopen(request).read()
545 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
546 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
549 def _real_extract(self, url):
550 # Extract id and simplified title from URL
551 mobj = re.match(self._VALID_URL, url)
553 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
556 video_id = mobj.group(1)
558 # Check if video comes from YouTube
# "yt-XXXX" ids are YouTube re-hosts: delegate to the downloader (and, per the
# omitted line 562, presumably return early -- confirm in the full file).
559 mobj2 = re.match(r'^yt-(.*)$', video_id)
560 if mobj2 is not None:
561 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
564 # Retrieve video webpage to extract further information
565 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
567 self.report_download_webpage(video_id)
568 webpage = urllib2.urlopen(request).read()
569 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
570 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
573 # Extract URL, uploader and title from webpage
574 self.report_extraction(video_id)
575 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
577 mediaURL = urllib.unquote(mobj.group(1))
578 video_extension = mediaURL[-3:]
580 # Extract gdaKey if available
581 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
585 gdaKey = mobj.group(1)
586 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull mediaData out of the flashvars parameter instead.
588 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
590 self._downloader.trouble(u'ERROR: unable to extract media URL')
592 vardict = parse_qs(mobj.group(1))
593 if 'mediaData' not in vardict:
594 self._downloader.trouble(u'ERROR: unable to extract media URL')
596 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
598 self._downloader.trouble(u'ERROR: unable to extract media URL')
600 mediaURL = mobj.group(1).replace('\\/', '/')
601 video_extension = mediaURL[-3:]
602 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
604 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
606 self._downloader.trouble(u'ERROR: unable to extract title')
608 video_title = mobj.group(1).decode('utf-8')
610 mobj = re.search(r'submitter=(.*?);', webpage)
612 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
614 video_uploader = mobj.group(1)
# Result dict; its opening `return [{` (original ~616) is omitted here.
617 'id': video_id.decode('utf-8'),
618 'url': video_url.decode('utf-8'),
619 'uploader': video_uploader.decode('utf-8'),
620 'upload_date': u'NA',
621 'title': video_title,
622 'ext': video_extension.decode('utf-8'),
626 class DailymotionIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump; omitted lines
# include `try:` statements and several assignments. Python 2 only.
627 """Information Extractor for Dailymotion"""
629 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
630 IE_NAME = u'dailymotion'
632 def __init__(self, downloader=None):
633 InfoExtractor.__init__(self, downloader)
635 def report_download_webpage(self, video_id):
636 """Report webpage download."""
637 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
639 def report_extraction(self, video_id):
640 """Report information extraction."""
641 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
643 def _real_extract(self, url):
644 # Extract id and simplified title from URL
645 mobj = re.match(self._VALID_URL, url)
647 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip title suffix ("id_title") and query string from the captured id.
650 video_id = mobj.group(1).split('_')[0].split('?')[0]
652 video_extension = 'mp4'
654 # Retrieve video webpage to extract further information
655 request = urllib2.Request(url)
# Disable the family filter cookie so age-gated pages are served.
656 request.add_header('Cookie', 'family_filter=off')
658 self.report_download_webpage(video_id)
659 webpage = urllib2.urlopen(request).read()
660 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
661 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
664 # Extract URL, uploader and title from webpage
665 self.report_extraction(video_id)
666 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
668 self._downloader.trouble(u'ERROR: unable to extract media URL')
670 flashvars = urllib.unquote(mobj.group(1))
# Scan quality keys from best to worst; the body that assigns `max_quality`
# (original ~673-676) is omitted from this excerpt -- `max_quality` below is
# presumably set to the first key found in flashvars; confirm in full file.
672 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
675 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
678 self._downloader.trouble(u'ERROR: unable to extract video URL')
681 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
683 self._downloader.trouble(u'ERROR: unable to extract video URL')
686 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
688 # TODO: support choosing qualities
690 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
692 self._downloader.trouble(u'ERROR: unable to extract title')
694 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
696 video_uploader = u'NA'
697 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
699 # lookin for official user
700 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
701 if mobj_official is None:
702 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
704 video_uploader = mobj_official.group(1)
706 video_uploader = mobj.group(1)
708 video_upload_date = u'NA'
709 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Reassemble DD-MM-YYYY into YYYYMMDD.
711 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict; its opening `return [{` is omitted here.
714 'id': video_id.decode('utf-8'),
715 'url': video_url.decode('utf-8'),
716 'uploader': video_uploader.decode('utf-8'),
717 'upload_date': video_upload_date,
718 'title': video_title,
719 'ext': video_extension.decode('utf-8'),
723 class GoogleIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump; omitted lines
# include `try:` statements and the mediaURL -> video_url assignment
# (original ~772) that line 808 relies on. Python 2 only.
724 """Information extractor for video.google.com."""
726 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
727 IE_NAME = u'video.google'
729 def __init__(self, downloader=None):
730 InfoExtractor.__init__(self, downloader)
732 def report_download_webpage(self, video_id):
733 """Report webpage download."""
734 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
736 def report_extraction(self, video_id):
737 """Report information extraction."""
738 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
740 def _real_extract(self, url):
741 # Extract id from URL
742 mobj = re.match(self._VALID_URL, url)
744 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
747 video_id = mobj.group(1)
749 video_extension = 'mp4'
751 # Retrieve video webpage to extract further information
752 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
754 self.report_download_webpage(video_id)
755 webpage = urllib2.urlopen(request).read()
756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
760 # Extract URL, uploader, and title from webpage
761 self.report_extraction(video_id)
# First try the mp4 download_url; fall back to the escaped flv videoUrl.
762 mobj = re.search(r"download_url:'([^']+)'", webpage)
764 video_extension = 'flv'
765 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
767 self._downloader.trouble(u'ERROR: unable to extract media URL')
769 mediaURL = urllib.unquote(mobj.group(1))
# Replace JS hex escapes: \x3d is '=', \x26 is '&'.
770 mediaURL = mediaURL.replace('\\x3d', '\x3d')
771 mediaURL = mediaURL.replace('\\x26', '\x26')
775 mobj = re.search(r'<title>(.*)</title>', webpage)
777 self._downloader.trouble(u'ERROR: unable to extract title')
779 video_title = mobj.group(1).decode('utf-8')
781 # Extract video description
782 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
784 self._downloader.trouble(u'ERROR: unable to extract video description')
786 video_description = mobj.group(1).decode('utf-8')
787 if not video_description:
788 video_description = 'No description available.'
790 # Extract video thumbnail
791 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail is scraped from a search-results page for this docid.
792 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
794 webpage = urllib2.urlopen(request).read()
795 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
796 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
798 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
800 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
802 video_thumbnail = mobj.group(1)
# Placeholder thumbnail assignment (original ~804) omitted from this excerpt.
803 else: # we need something to pass to process_info
# Result dict; its opening `return [{` is omitted here.
807 'id': video_id.decode('utf-8'),
808 'url': video_url.decode('utf-8'),
810 'upload_date': u'NA',
811 'title': video_title,
812 'ext': video_extension.decode('utf-8'),
816 class PhotobucketIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump; omitted lines
# include `try:` statements and the mediaURL -> video_url assignment
# (original ~860) that line 873 relies on. Python 2 only.
817 """Information extractor for photobucket.com."""
819 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
820 IE_NAME = u'photobucket'
822 def __init__(self, downloader=None):
823 InfoExtractor.__init__(self, downloader)
825 def report_download_webpage(self, video_id):
826 """Report webpage download."""
827 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
829 def report_extraction(self, video_id):
830 """Report information extraction."""
831 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
833 def _real_extract(self, url):
834 # Extract id from URL
835 mobj = re.match(self._VALID_URL, url)
837 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
840 video_id = mobj.group(1)
842 video_extension = 'flv'
844 # Retrieve video webpage to extract further information
845 request = urllib2.Request(url)
847 self.report_download_webpage(video_id)
848 webpage = urllib2.urlopen(request).read()
849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
850 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
853 # Extract URL, uploader, and title from webpage
854 self.report_extraction(video_id)
855 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
857 self._downloader.trouble(u'ERROR: unable to extract media URL')
859 mediaURL = urllib.unquote(mobj.group(1))
# The <title> carries both the title and the uploader nickname.
863 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
865 self._downloader.trouble(u'ERROR: unable to extract title')
867 video_title = mobj.group(1).decode('utf-8')
869 video_uploader = mobj.group(2).decode('utf-8')
# Result dict; its opening `return [{` is omitted here.
872 'id': video_id.decode('utf-8'),
873 'url': video_url.decode('utf-8'),
874 'uploader': video_uploader,
875 'upload_date': u'NA',
876 'title': video_title,
877 'ext': video_extension.decode('utf-8'),
881 class YahooIE(InfoExtractor):
# NOTE(review): sampled excerpt -- embedded line numbers jump; `try:` lines
# and the result-dict opening are in omitted lines. Python 2 only.
882 """Information extractor for video.yahoo.com."""
884 # _VALID_URL matches all Yahoo! Video URLs
885 # _VPAGE_URL matches only the extractable '/watch/' URLs
886 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
887 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
888 IE_NAME = u'video.yahoo'
890 def __init__(self, downloader=None):
891 InfoExtractor.__init__(self, downloader)
893 def report_download_webpage(self, video_id):
894 """Report webpage download."""
895 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
897 def report_extraction(self, video_id):
898 """Report information extraction."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the single recursive retry after rewriting a
# non-/watch/ URL into the canonical /watch/ form (see line 934).
901 def _real_extract(self, url, new_video=True):
902 # Extract ID from URL
903 mobj = re.match(self._VALID_URL, url)
905 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
908 video_id = mobj.group(2)
909 video_extension = 'flv'
911 # Rewrite valid but non-extractable URLs as
912 # extractable English language /watch/ URLs
913 if re.match(self._VPAGE_URL, url) is None:
914 request = urllib2.Request(url)
916 webpage = urllib2.urlopen(request).read()
917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
921 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
923 self._downloader.trouble(u'ERROR: Unable to extract id field')
925 yahoo_id = mobj.group(1)
927 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
929 self._downloader.trouble(u'ERROR: Unable to extract vid field')
931 yahoo_vid = mobj.group(1)
933 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
934 return self._real_extract(url, new_video=False)
936 # Retrieve video webpage to extract further information
937 request = urllib2.Request(url)
939 self.report_download_webpage(video_id)
940 webpage = urllib2.urlopen(request).read()
941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
942 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
945 # Extract uploader and title from webpage
946 self.report_extraction(video_id)
947 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
949 self._downloader.trouble(u'ERROR: unable to extract video title')
951 video_title = mobj.group(1).decode('utf-8')
953 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
955 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# anchor text captured by group(2) -- verify against the full file.
957 video_uploader = mobj.group(1).decode('utf-8')
959 # Extract video thumbnail
960 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
962 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
964 video_thumbnail = mobj.group(1).decode('utf-8')
966 # Extract video description
967 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
969 self._downloader.trouble(u'ERROR: unable to extract video description')
971 video_description = mobj.group(1).decode('utf-8')
972 if not video_description:
973 video_description = 'No description available.'
975 # Extract video height and width
976 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
978 self._downloader.trouble(u'ERROR: unable to extract video height')
980 yv_video_height = mobj.group(1)
982 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
984 self._downloader.trouble(u'ERROR: unable to extract video width')
986 yv_video_width = mobj.group(1)
988 # Retrieve video playlist to extract media URL
989 # I'm not completely sure what all these options are, but we
990 # seem to need most of them, otherwise the server sends a 401.
991 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
992 yv_bitrate = '700' # according to Wikipedia this is hard-coded
993 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
994 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
995 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
997 self.report_download_webpage(video_id)
998 webpage = urllib2.urlopen(request).read()
999 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1000 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1003 # Extract media URL from playlist XML
1004 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1006 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1008 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1009 video_url = unescapeHTML(video_url)
# Result dict; its opening `return [{` is omitted here.
1012 'id': video_id.decode('utf-8'),
1014 'uploader': video_uploader,
1015 'upload_date': u'NA',
1016 'title': video_title,
1017 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this dict literal (lines 1018 and
# 1020); in Python the later entry silently wins, so the decoded value on
# line 1018 is discarded -- one of the two should be removed in the full file.
1018 'thumbnail': video_thumbnail.decode('utf-8'),
1019 'description': video_description,
1020 'thumbnail': video_thumbnail,
1024 class VimeoIE(InfoExtractor):
1025 """Information extractor for vimeo.com."""
1027 # _VALID_URL matches Vimeo URLs
1028 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1031 def __init__(self, downloader=None):
1032 InfoExtractor.__init__(self, downloader)
1034 def report_download_webpage(self, video_id):
1035 """Report webpage download."""
1036 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1038 def report_extraction(self, video_id):
1039 """Report information extraction."""
1040 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1042 def _real_extract(self, url, new_video=True):
1043 # Extract ID from URL
1044 mobj = re.match(self._VALID_URL, url)
1046 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1049 video_id = mobj.group(1)
1051 # Retrieve video webpage to extract further information
1052 request = urllib2.Request(url, None, std_headers)
1054 self.report_download_webpage(video_id)
1055 webpage = urllib2.urlopen(request).read()
1056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1057 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1060 # Now we begin extracting as much information as we can from what we
1061 # retrieved. First we extract the information common to all extractors,
1062 # and latter we extract those that are Vimeo specific.
1063 self.report_extraction(video_id)
1065 # Extract the config JSON
1066 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1068 config = json.loads(config)
1070 self._downloader.trouble(u'ERROR: unable to extract info section')
1074 video_title = config["video"]["title"]
1077 video_uploader = config["video"]["owner"]["name"]
1079 # Extract video thumbnail
1080 video_thumbnail = config["video"]["thumbnail"]
1082 # Extract video description
1083 video_description = get_element_by_id("description", webpage.decode('utf8'))
1084 if video_description: video_description = clean_html(video_description)
1085 else: video_description = ''
1087 # Extract upload date
1088 video_upload_date = u'NA'
1089 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1090 if mobj is not None:
1091 video_upload_date = mobj.group(1)
1093 # Vimeo specific: extract request signature and timestamp
1094 sig = config['request']['signature']
1095 timestamp = config['request']['timestamp']
1097 # Vimeo specific: extract video codec and quality information
1098 # First consider quality, then codecs, then take everything
1099 # TODO bind to format param
1100 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1101 files = { 'hd': [], 'sd': [], 'other': []}
1102 for codec_name, codec_extension in codecs:
1103 if codec_name in config["video"]["files"]:
1104 if 'hd' in config["video"]["files"][codec_name]:
1105 files['hd'].append((codec_name, codec_extension, 'hd'))
1106 elif 'sd' in config["video"]["files"][codec_name]:
1107 files['sd'].append((codec_name, codec_extension, 'sd'))
1109 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1111 for quality in ('hd', 'sd', 'other'):
1112 if len(files[quality]) > 0:
1113 video_quality = files[quality][0][2]
1114 video_codec = files[quality][0][0]
1115 video_extension = files[quality][0][1]
1116 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1119 self._downloader.trouble(u'ERROR: no known codec found')
1122 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1123 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1128 'uploader': video_uploader,
1129 'upload_date': video_upload_date,
1130 'title': video_title,
1131 'ext': video_extension,
1132 'thumbnail': video_thumbnail,
1133 'description': video_description,
1137 class GenericIE(InfoExtractor):
1138 """Generic last-resort information extractor."""
1141 IE_NAME = u'generic'
1143 def __init__(self, downloader=None):
1144 InfoExtractor.__init__(self, downloader)
1146 def report_download_webpage(self, video_id):
1147 """Report webpage download."""
1148 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1149 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1151 def report_extraction(self, video_id):
1152 """Report information extraction."""
1153 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1155 def report_following_redirect(self, new_url):
1156 """Report information extraction."""
1157 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1159 def _test_redirect(self, url):
1160 """Check if it is a redirect, like url shorteners, in case restart chain."""
1161 class HeadRequest(urllib2.Request):
1162 def get_method(self):
1165 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1167 Subclass the HTTPRedirectHandler to make it use our
1168 HeadRequest also on the redirected URL
1170 def redirect_request(self, req, fp, code, msg, headers, newurl):
1171 if code in (301, 302, 303, 307):
1172 newurl = newurl.replace(' ', '%20')
1173 newheaders = dict((k,v) for k,v in req.headers.items()
1174 if k.lower() not in ("content-length", "content-type"))
1175 return HeadRequest(newurl,
1177 origin_req_host=req.get_origin_req_host(),
1180 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1182 class HTTPMethodFallback(urllib2.BaseHandler):
1184 Fallback to GET if HEAD is not allowed (405 HTTP error)
1186 def http_error_405(self, req, fp, code, msg, headers):
1190 newheaders = dict((k,v) for k,v in req.headers.items()
1191 if k.lower() not in ("content-length", "content-type"))
1192 return self.parent.open(urllib2.Request(req.get_full_url(),
1194 origin_req_host=req.get_origin_req_host(),
1198 opener = urllib2.OpenerDirector()
1199 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1200 HTTPMethodFallback, HEADRedirectHandler,
1201 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1202 opener.add_handler(handler())
1204 response = opener.open(HeadRequest(url))
1205 new_url = response.geturl()
1207 if url == new_url: return False
1209 self.report_following_redirect(new_url)
1210 self._downloader.download([new_url])
1213 def _real_extract(self, url):
1214 if self._test_redirect(url): return
1216 video_id = url.split('/')[-1]
1217 request = urllib2.Request(url)
1219 self.report_download_webpage(video_id)
1220 webpage = urllib2.urlopen(request).read()
1221 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1224 except ValueError, err:
1225 # since this is the last-resort InfoExtractor, if
1226 # this error is thrown, it'll be thrown here
1227 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1230 self.report_extraction(video_id)
1231 # Start with something easy: JW Player in SWFObject
1232 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1234 # Broaden the search a little bit
1235 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1237 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1240 # It's possible that one of the regexes
1241 # matched, but returned an empty group:
1242 if mobj.group(1) is None:
1243 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1246 video_url = urllib.unquote(mobj.group(1))
1247 video_id = os.path.basename(video_url)
1249 # here's a fun little line of code for you:
1250 video_extension = os.path.splitext(video_id)[1][1:]
1251 video_id = os.path.splitext(video_id)[0]
1253 # it's tempting to parse this further, but you would
1254 # have to take into account all the variations like
1255 # Video Title - Site Name
1256 # Site Name | Video Title
1257 # Video Title - Tagline | Site Name
1258 # and so on and so forth; it's just not practical
1259 mobj = re.search(r'<title>(.*)</title>', webpage)
1261 self._downloader.trouble(u'ERROR: unable to extract title')
1263 video_title = mobj.group(1).decode('utf-8')
1265 # video uploader is domain name
1266 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1268 self._downloader.trouble(u'ERROR: unable to extract title')
1270 video_uploader = mobj.group(1).decode('utf-8')
1273 'id': video_id.decode('utf-8'),
1274 'url': video_url.decode('utf-8'),
1275 'uploader': video_uploader,
1276 'upload_date': u'NA',
1277 'title': video_title,
1278 'ext': video_extension.decode('utf-8'),
1282 class YoutubeSearchIE(InfoExtractor):
1283 """Information Extractor for YouTube search queries."""
1284 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1285 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1286 _max_youtube_results = 1000
1287 IE_NAME = u'youtube:search'
1289 def __init__(self, downloader=None):
1290 InfoExtractor.__init__(self, downloader)
1292 def report_download_page(self, query, pagenum):
1293 """Report attempt to download search page with given number."""
1294 query = query.decode(preferredencoding())
1295 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1297 def _real_extract(self, query):
1298 mobj = re.match(self._VALID_URL, query)
1300 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1303 prefix, query = query.split(':')
1305 query = query.encode('utf-8')
1307 self._download_n_results(query, 1)
1309 elif prefix == 'all':
1310 self._download_n_results(query, self._max_youtube_results)
1316 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1318 elif n > self._max_youtube_results:
1319 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1320 n = self._max_youtube_results
1321 self._download_n_results(query, n)
1323 except ValueError: # parsing prefix as integer fails
1324 self._download_n_results(query, 1)
1327 def _download_n_results(self, query, n):
1328 """Downloads a specified number of results for a query"""
1334 while (50 * pagenum) < limit:
1335 self.report_download_page(query, pagenum+1)
1336 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1337 request = urllib2.Request(result_url)
1339 data = urllib2.urlopen(request).read()
1340 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1341 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1343 api_response = json.loads(data)['data']
1345 new_ids = list(video['id'] for video in api_response['items'])
1346 video_ids += new_ids
1348 limit = min(n, api_response['totalItems'])
1351 if len(video_ids) > n:
1352 video_ids = video_ids[:n]
1353 for id in video_ids:
1354 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1358 class GoogleSearchIE(InfoExtractor):
1359 """Information Extractor for Google Video search queries."""
1360 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1361 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1362 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1363 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1364 _max_google_results = 1000
1365 IE_NAME = u'video.google:search'
1367 def __init__(self, downloader=None):
1368 InfoExtractor.__init__(self, downloader)
1370 def report_download_page(self, query, pagenum):
1371 """Report attempt to download playlist page with given number."""
1372 query = query.decode(preferredencoding())
1373 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1375 def _real_extract(self, query):
1376 mobj = re.match(self._VALID_URL, query)
1378 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1381 prefix, query = query.split(':')
1383 query = query.encode('utf-8')
1385 self._download_n_results(query, 1)
1387 elif prefix == 'all':
1388 self._download_n_results(query, self._max_google_results)
1394 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1396 elif n > self._max_google_results:
1397 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1398 n = self._max_google_results
1399 self._download_n_results(query, n)
1401 except ValueError: # parsing prefix as integer fails
1402 self._download_n_results(query, 1)
1405 def _download_n_results(self, query, n):
1406 """Downloads a specified number of results for a query"""
1412 self.report_download_page(query, pagenum)
1413 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1414 request = urllib2.Request(result_url)
1416 page = urllib2.urlopen(request).read()
1417 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1418 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1421 # Extract video identifiers
1422 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1423 video_id = mobj.group(1)
1424 if video_id not in video_ids:
1425 video_ids.append(video_id)
1426 if len(video_ids) == n:
1427 # Specified n videos reached
1428 for id in video_ids:
1429 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1432 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1433 for id in video_ids:
1434 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1437 pagenum = pagenum + 1
1440 class YahooSearchIE(InfoExtractor):
1441 """Information Extractor for Yahoo! Video search queries."""
1442 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1443 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1444 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1445 _MORE_PAGES_INDICATOR = r'\s*Next'
1446 _max_yahoo_results = 1000
1447 IE_NAME = u'video.yahoo:search'
1449 def __init__(self, downloader=None):
1450 InfoExtractor.__init__(self, downloader)
1452 def report_download_page(self, query, pagenum):
1453 """Report attempt to download playlist page with given number."""
1454 query = query.decode(preferredencoding())
1455 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1457 def _real_extract(self, query):
1458 mobj = re.match(self._VALID_URL, query)
1460 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1463 prefix, query = query.split(':')
1465 query = query.encode('utf-8')
1467 self._download_n_results(query, 1)
1469 elif prefix == 'all':
1470 self._download_n_results(query, self._max_yahoo_results)
1476 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1478 elif n > self._max_yahoo_results:
1479 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1480 n = self._max_yahoo_results
1481 self._download_n_results(query, n)
1483 except ValueError: # parsing prefix as integer fails
1484 self._download_n_results(query, 1)
1487 def _download_n_results(self, query, n):
1488 """Downloads a specified number of results for a query"""
1491 already_seen = set()
1495 self.report_download_page(query, pagenum)
1496 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1497 request = urllib2.Request(result_url)
1499 page = urllib2.urlopen(request).read()
1500 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1501 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1504 # Extract video identifiers
1505 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1506 video_id = mobj.group(1)
1507 if video_id not in already_seen:
1508 video_ids.append(video_id)
1509 already_seen.add(video_id)
1510 if len(video_ids) == n:
1511 # Specified n videos reached
1512 for id in video_ids:
1513 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1516 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1517 for id in video_ids:
1518 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1521 pagenum = pagenum + 1
1524 class YoutubePlaylistIE(InfoExtractor):
1525 """Information Extractor for YouTube playlists."""
1527 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1528 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1529 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1530 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1531 IE_NAME = u'youtube:playlist'
1533 def __init__(self, downloader=None):
1534 InfoExtractor.__init__(self, downloader)
1536 def report_download_page(self, playlist_id, pagenum):
1537 """Report attempt to download playlist page with given number."""
1538 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1540 def _real_extract(self, url):
1541 # Extract playlist id
1542 mobj = re.match(self._VALID_URL, url)
1544 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1548 if mobj.group(3) is not None:
1549 self._downloader.download([mobj.group(3)])
1552 # Download playlist pages
1553 # prefix is 'p' as default for playlists but there are other types that need extra care
1554 playlist_prefix = mobj.group(1)
1555 if playlist_prefix == 'a':
1556 playlist_access = 'artist'
1558 playlist_prefix = 'p'
1559 playlist_access = 'view_play_list'
1560 playlist_id = mobj.group(2)
1565 self.report_download_page(playlist_id, pagenum)
1566 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1567 request = urllib2.Request(url)
1569 page = urllib2.urlopen(request).read()
1570 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1571 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1574 # Extract video identifiers
1576 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1577 if mobj.group(1) not in ids_in_page:
1578 ids_in_page.append(mobj.group(1))
1579 video_ids.extend(ids_in_page)
1581 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1583 pagenum = pagenum + 1
1585 playliststart = self._downloader.params.get('playliststart', 1) - 1
1586 playlistend = self._downloader.params.get('playlistend', -1)
1587 if playlistend == -1:
1588 video_ids = video_ids[playliststart:]
1590 video_ids = video_ids[playliststart:playlistend]
1592 for id in video_ids:
1593 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1597 class YoutubeChannelIE(InfoExtractor):
1598 """Information Extractor for YouTube channels."""
1600 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1601 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1602 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1603 IE_NAME = u'youtube:channel'
1605 def report_download_page(self, channel_id, pagenum):
1606 """Report attempt to download channel page with given number."""
1607 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1609 def _real_extract(self, url):
1610 # Extract channel id
1611 mobj = re.match(self._VALID_URL, url)
1613 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1616 # Download channel pages
1617 channel_id = mobj.group(1)
1622 self.report_download_page(channel_id, pagenum)
1623 url = self._TEMPLATE_URL % (channel_id, pagenum)
1624 request = urllib2.Request(url)
1626 page = urllib2.urlopen(request).read()
1627 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1628 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1631 # Extract video identifiers
1633 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1634 if mobj.group(1) not in ids_in_page:
1635 ids_in_page.append(mobj.group(1))
1636 video_ids.extend(ids_in_page)
1638 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1640 pagenum = pagenum + 1
1642 for id in video_ids:
1643 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1647 class YoutubeUserIE(InfoExtractor):
1648 """Information Extractor for YouTube users."""
1650 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1651 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1652 _GDATA_PAGE_SIZE = 50
1653 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1654 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1655 IE_NAME = u'youtube:user'
1657 def __init__(self, downloader=None):
1658 InfoExtractor.__init__(self, downloader)
1660 def report_download_page(self, username, start_index):
1661 """Report attempt to download user page."""
1662 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1663 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1665 def _real_extract(self, url):
1667 mobj = re.match(self._VALID_URL, url)
1669 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1672 username = mobj.group(1)
1674 # Download video ids using YouTube Data API. Result size per
1675 # query is limited (currently to 50 videos) so we need to query
1676 # page by page until there are no video ids - it means we got
1683 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1684 self.report_download_page(username, start_index)
1686 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1689 page = urllib2.urlopen(request).read()
1690 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1691 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1694 # Extract video identifiers
1697 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1698 if mobj.group(1) not in ids_in_page:
1699 ids_in_page.append(mobj.group(1))
1701 video_ids.extend(ids_in_page)
1703 # A little optimization - if current page is not
1704 # "full", ie. does not contain PAGE_SIZE video ids then
1705 # we can assume that this page is the last one - there
1706 # are no more ids on further pages - no need to query
1709 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1714 all_ids_count = len(video_ids)
1715 playliststart = self._downloader.params.get('playliststart', 1) - 1
1716 playlistend = self._downloader.params.get('playlistend', -1)
1718 if playlistend == -1:
1719 video_ids = video_ids[playliststart:]
1721 video_ids = video_ids[playliststart:playlistend]
1723 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1724 (username, all_ids_count, len(video_ids)))
1726 for video_id in video_ids:
1727 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1730 class BlipTVUserIE(InfoExtractor):
1731 """Information Extractor for blip.tv users."""
1733 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1735 IE_NAME = u'blip.tv:user'
1737 def __init__(self, downloader=None):
1738 InfoExtractor.__init__(self, downloader)
1740 def report_download_page(self, username, pagenum):
1741 """Report attempt to download user page."""
1742 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1743 (self.IE_NAME, username, pagenum))
1745 def _real_extract(self, url):
1747 mobj = re.match(self._VALID_URL, url)
1749 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1752 username = mobj.group(1)
1754 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1756 request = urllib2.Request(url)
1759 page = urllib2.urlopen(request).read().decode('utf-8')
1760 mobj = re.search(r'data-users-id="([^"]+)"', page)
1761 page_base = page_base % mobj.group(1)
1762 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1763 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1767 # Download video ids using BlipTV Ajax calls. Result size per
1768 # query is limited (currently to 12 videos) so we need to query
1769 # page by page until there are no video ids - it means we got
1776 self.report_download_page(username, pagenum)
1778 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1781 page = urllib2.urlopen(request).read().decode('utf-8')
1782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1786 # Extract video identifiers
1789 for mobj in re.finditer(r'href="/([^"]+)"', page):
1790 if mobj.group(1) not in ids_in_page:
1791 ids_in_page.append(unescapeHTML(mobj.group(1)))
1793 video_ids.extend(ids_in_page)
1795 # A little optimization - if current page is not
1796 # "full", ie. does not contain PAGE_SIZE video ids then
1797 # we can assume that this page is the last one - there
1798 # are no more ids on further pages - no need to query
1801 if len(ids_in_page) < self._PAGE_SIZE:
1806 all_ids_count = len(video_ids)
1807 playliststart = self._downloader.params.get('playliststart', 1) - 1
1808 playlistend = self._downloader.params.get('playlistend', -1)
1810 if playlistend == -1:
1811 video_ids = video_ids[playliststart:]
1813 video_ids = video_ids[playliststart:playlistend]
1815 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1816 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1818 for video_id in video_ids:
1819 self._downloader.download([u'http://blip.tv/'+video_id])
1822 class DepositFilesIE(InfoExtractor):
1823 """Information extractor for depositfiles.com"""
1825 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1826 IE_NAME = u'DepositFiles'
1828 def __init__(self, downloader=None):
1829 InfoExtractor.__init__(self, downloader)
1831 def report_download_webpage(self, file_id):
1832 """Report webpage download."""
1833 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1835 def report_extraction(self, file_id):
1836 """Report information extraction."""
1837 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1839 def _real_extract(self, url):
1840 file_id = url.split('/')[-1]
1841 # Rebuild url in english locale
1842 url = 'http://depositfiles.com/en/files/' + file_id
1844 # Retrieve file webpage with 'Free download' button pressed
1845 free_download_indication = { 'gateway_result' : '1' }
1846 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1848 self.report_download_webpage(file_id)
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1854 # Search for the real file URL
1855 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1856 if (mobj is None) or (mobj.group(1) is None):
1857 # Try to figure out reason of the error.
1858 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1859 if (mobj is not None) and (mobj.group(1) is not None):
1860 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1861 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1863 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1866 file_url = mobj.group(1)
1867 file_extension = os.path.splitext(file_url)[1][1:]
1869 # Search for file title
1870 mobj = re.search(r'<b title="(.*?)">', webpage)
1872 self._downloader.trouble(u'ERROR: unable to extract title')
1874 file_title = mobj.group(1).decode('utf-8')
1877 'id': file_id.decode('utf-8'),
1878 'url': file_url.decode('utf-8'),
1880 'upload_date': u'NA',
1881 'title': file_title,
1882 'ext': file_extension.decode('utf-8'),
1886 class FacebookIE(InfoExtractor):
1887 """Information Extractor for Facebook"""
# Captures the numeric video id in the named group <ID>.
1889 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1890 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name used to look up credentials in the user's .netrc file.
1891 _NETRC_MACHINE = 'facebook'
# Format identifiers ordered best-quality-first; used for format selection below.
1892 _available_formats = ['video', 'highqual', 'lowqual']
1893 _video_extensions = {
1898 IE_NAME = u'facebook'
1900 def __init__(self, downloader=None):
1901 InfoExtractor.__init__(self, downloader)
1903 def _reporter(self, message):
1904 """Add header and report message."""
1905 self._downloader.to_screen(u'[facebook] %s' % message)
1907 def report_login(self):
1908 """Report attempt to log in."""
1909 self._reporter(u'Logging in')
1911 def report_video_webpage_download(self, video_id):
1912 """Report attempt to download video webpage."""
1913 self._reporter(u'%s: Downloading video webpage' % video_id)
1915 def report_information_extraction(self, video_id):
1916 """Report attempt to extract video information."""
1917 self._reporter(u'%s: Extracting video information' % video_id)
1919 def _parse_page(self, video_webpage):
1920 """Extract video information from page"""
# Table mapping info-dict keys to the regexes that locate each value in the page.
1922 data = {'title': r'\("video_title", "(.*?)"\)',
1923 'description': r'<div class="datawrap">(.*?)</div>',
1924 'owner': r'\("video_owner_name", "(.*?)"\)',
1925 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1928 for piece in data.keys():
1929 mobj = re.search(data[piece], video_webpage)
1930 if mobj is not None:
# Values are JS/Unicode-escaped inside the page; undo the escaping before storing.
1931 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format, same un-escaping as above.
1935 for fmt in self._available_formats:
1936 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1937 if mobj is not None:
1938 # URL is in a Javascript segment inside an escaped Unicode format within
1939 # the generally utf-8 page
1940 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1941 video_info['video_urls'] = video_urls
# Best-effort login: every failure below is a WARNING only; extraction proceeds anonymously.
1945 def _real_initialize(self):
1946 if self._downloader is None:
1951 downloader_params = self._downloader.params
1953 # Attempt to use provided username and password or .netrc data
1954 if downloader_params.get('username', None) is not None:
1955 useremail = downloader_params['username']
1956 password = downloader_params['password']
1957 elif downloader_params.get('usenetrc', False):
1959 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1960 if info is not None:
1964 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1965 except (IOError, netrc.NetrcParseError), err:
1966 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1969 if useremail is None:
1978 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1981 login_results = urllib2.urlopen(request).read()
# A login <form> surviving in the response means authentication did not succeed.
1982 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1983 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1989 def _real_extract(self, url):
1990 mobj = re.match(self._VALID_URL, url)
1992 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1994 video_id = mobj.group('ID')
1997 self.report_video_webpage_download(video_id)
1998 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2000 page = urllib2.urlopen(request)
2001 video_webpage = page.read()
2002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2006 # Start extracting information
2007 self.report_information_extraction(video_id)
2009 # Extract information
2010 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; their absence aborts extraction.
2013 if 'owner' not in video_info:
2014 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2016 video_uploader = video_info['owner']
2019 if 'title' not in video_info:
2020 self._downloader.trouble(u'ERROR: unable to extract video title')
2022 video_title = video_info['title']
2023 video_title = video_title.decode('utf-8')
# Thumbnail is optional: warn and fall back to an empty string.
2026 if 'thumbnail' not in video_info:
2027 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2028 video_thumbnail = ''
2030 video_thumbnail = video_info['thumbnail']
# NOTE(review): the visible regex table in _parse_page has no 'upload_date'
# entry, so this branch may never fire — confirm against the full parser.
2034 if 'upload_date' in video_info:
2035 upload_time = video_info['upload_date']
2036 timetuple = email.utils.parsedate_tz(upload_time)
2037 if timetuple is not None:
2039 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2044 video_description = video_info.get('description', 'No description available.')
2046 url_map = video_info['video_urls']
2047 if len(url_map.keys()) > 0:
2048 # Decide which formats to download
2049 req_format = self._downloader.params.get('format', None)
2050 format_limit = self._downloader.params.get('format_limit', None)
# Honour --format-limit by truncating the preference list at the limit.
2052 if format_limit is not None and format_limit in self._available_formats:
2053 format_list = self._available_formats[self._available_formats.index(format_limit):]
2055 format_list = self._available_formats
2056 existing_formats = [x for x in format_list if x in url_map]
2057 if len(existing_formats) == 0:
2058 self._downloader.trouble(u'ERROR: no known formats available for video')
2060 if req_format is None:
2061 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2062 elif req_format == 'worst':
2063 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2064 elif req_format == '-1':
2065 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2068 if req_format not in url_map:
2069 self._downloader.trouble(u'ERROR: requested format not available')
2071 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dictionary per selected format.
2074 for format_param, video_real_url in video_url_list:
2076 video_extension = self._video_extensions.get(format_param, 'mp4')
2079 'id': video_id.decode('utf-8'),
2080 'url': video_real_url.decode('utf-8'),
2081 'uploader': video_uploader.decode('utf-8'),
2082 'upload_date': upload_date,
2083 'title': video_title,
2084 'ext': video_extension.decode('utf-8'),
2085 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2086 'thumbnail': video_thumbnail.decode('utf-8'),
2087 'description': video_description.decode('utf-8'),
2091 class BlipTVIE(InfoExtractor):
2092 """Information extractor for blip.tv"""
2094 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the file extension off the media URL.
2095 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2096 IE_NAME = u'blip.tv'
2098 def report_extraction(self, file_id):
2099 """Report information extraction."""
2100 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2102 def report_direct_download(self, title):
2103 """Report direct (non-JSON) media download."""
2104 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2106 def _real_extract(self, url):
2107 mobj = re.match(self._VALID_URL, url)
2109 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page instead of scraping HTML.
2116 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2117 request = urllib2.Request(json_url.encode('utf-8'))
2118 self.report_extraction(mobj.group(1))
2121 urlh = urllib2.urlopen(request)
# If the server answers with the media itself, derive title/ext from the URL path.
2122 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2123 basename = url.split('/')[-1]
2124 title,ext = os.path.splitext(basename)
2125 title = title.decode('UTF-8')
2126 ext = ext.replace('.', '')
2127 self.report_direct_download(title)
2135 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2136 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2138 if info is None: # Regular URL
2140 json_code = urlh.read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2146 json_data = json.loads(json_code)
2147 if 'Post' in json_data:
2148 data = json_data['Post']
# blip.tv datestamps look like '11-06-10 02:30PM'; normalise to YYYYMMDD.
2152 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2153 video_url = data['media']['url']
2154 umobj = re.match(self._URL_EXT, video_url)
2156 raise ValueError('Can not determine filename extension')
2157 ext = umobj.group(1)
2160 'id': data['item_id'],
2162 'uploader': data['display_name'],
2163 'upload_date': upload_date,
2164 'title': data['title'],
2166 'format': data['media']['mimeType'],
2167 'thumbnail': data['thumbnailUrl'],
2168 'description': data['description'],
2169 'player_url': data['embedUrl']
2171 except (ValueError,KeyError), err:
2172 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves some media only to known players; masquerade as iTunes for the download.
2175 std_headers['User-Agent'] = 'iTunes/10.6.1'
2179 class MyVideoIE(InfoExtractor):
2180 """Information Extractor for myvideo.de."""
2182 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2183 IE_NAME = u'myvideo'
2185 def __init__(self, downloader=None):
2186 InfoExtractor.__init__(self, downloader)
2188 def report_download_webpage(self, video_id):
2189 """Report webpage download."""
2190 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2192 def report_extraction(self, video_id):
2193 """Report information extraction."""
2194 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2196 def _real_extract(self,url):
2197 mobj = re.match(self._VALID_URL, url)
2199 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2202 video_id = mobj.group(1)
2205 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2207 self.report_download_webpage(video_id)
2208 webpage = urllib2.urlopen(request).read()
2209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2210 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2213 self.report_extraction(video_id)
2214 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2217 self._downloader.trouble(u'ERROR: unable to extract media URL')
2219 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2221 mobj = re.search('<title>([^<]+)</title>', webpage)
2223 self._downloader.trouble(u'ERROR: unable to extract title')
2226 video_title = mobj.group(1)
2232 'upload_date': u'NA',
2233 'title': video_title,
2237 class ComedyCentralIE(InfoExtractor):
2238 """Information extractor for The Daily Show and Colbert Report """
# Accepts ':tds'/':colbert'-style shortcuts as well as full-episode URLs.
2240 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2241 IE_NAME = u'comedycentral'
# Known bitrates, worst first; the tables below map bitrate -> extension/size.
2243 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2245 _video_extensions = {
2253 _video_dimensions = {
2262 def report_extraction(self, episode_id):
2263 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2265 def report_config_download(self, episode_id):
2266 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2268 def report_index_download(self, episode_id):
2269 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2271 def report_player_url(self, episode_id):
2272 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Print one line per known bitrate for --list-formats.
2275 def _print_formats(self, formats):
2276 print('Available formats:')
2278 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2281 def _real_extract(self, url):
2282 mobj = re.match(self._VALID_URL, url)
2284 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand shorthand forms (':tds', ':colbert', ...) to the real site URL and re-match.
2287 if mobj.group('shortname'):
2288 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2289 url = u'http://www.thedailyshow.com/full-episodes/'
2291 url = u'http://www.colbertnation.com/full-episodes/'
2292 mobj = re.match(self._VALID_URL, url)
2293 assert mobj is not None
2295 dlNewest = not mobj.group('episode')
2297 epTitle = mobj.group('showname')
2299 epTitle = mobj.group('episode')
2301 req = urllib2.Request(url)
2302 self.report_extraction(epTitle)
2304 htmlHandle = urllib2.urlopen(req)
2305 html = htmlHandle.read()
# NOTE(review): this class uses unicode(err) where sibling extractors use
# compat_str(err) — consider unifying.
2306 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2307 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# '/full-episodes/' redirects to the newest episode; re-match the final URL.
2310 url = htmlHandle.geturl()
2311 mobj = re.match(self._VALID_URL, url)
2313 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2315 if mobj.group('episode') == '':
2316 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2318 epTitle = mobj.group('episode')
2320 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2322 if len(mMovieParams) == 0:
2323 # The Colbert Report embeds the information in a without
2324 # a URL prefix; so extract the alternate reference
2325 # and then add the URL prefix manually.
2327 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2328 if len(altMovieParams) == 0:
2329 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2332 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL through its redirect chain.
2334 playerUrl_raw = mMovieParams[0][0]
2335 self.report_player_url(epTitle)
2337 urlHandle = urllib2.urlopen(playerUrl_raw)
2338 playerUrl = urlHandle.geturl()
2339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index listing every media item of the episode.
2343 uri = mMovieParams[0][1]
2344 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2345 self.report_index_download(epTitle)
2347 indexXml = urllib2.urlopen(indexUrl).read()
2348 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2349 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2354 idoc = xml.etree.ElementTree.fromstring(indexXml)
2355 itemEls = idoc.findall('.//item')
2356 for itemEl in itemEls:
# guid is of the form '...:<show>.com:<short id>'.
2357 mediaId = itemEl.findall('./guid')[0].text
2358 shortMediaId = mediaId.split(':')[-1]
2359 showId = mediaId.split(':')[-2].replace('.com', '')
2360 officialTitle = itemEl.findall('./title')[0].text
2361 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item config XML lists one <rendition> per available bitrate.
2363 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2364 urllib.urlencode({'uri': mediaId}))
2365 configReq = urllib2.Request(configUrl)
2366 self.report_config_download(epTitle)
2368 configXml = urllib2.urlopen(configReq).read()
2369 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2373 cdoc = xml.etree.ElementTree.fromstring(configXml)
2375 for rendition in cdoc.findall('.//rendition'):
2376 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2380 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2383 if self._downloader.params.get('listformats', None):
2384 self._print_formats([i[0] for i in turls])
2387 # For now, just pick the highest bitrate
2388 format,video_url = turls[-1]
2390 # Get the format arg from the arg stream
2391 req_format = self._downloader.params.get('format', None)
2393 # Select format if we can find one
2396 format, video_url = f, v
2399 # Patch to download from alternative CDN, which does not
2400 # break on current RTMPDump builds
2401 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2402 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2404 if video_url.startswith(broken_cdn):
2405 video_url = video_url.replace(broken_cdn, better_cdn)
2407 effTitle = showId + u'-' + epTitle
2412 'upload_date': officialDate,
2417 'description': officialTitle,
2418 'player_url': None #playerUrl
2421 results.append(info)
2426 class EscapistIE(InfoExtractor):
2427 """Information extractor for The Escapist """
2429 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2430 IE_NAME = u'escapist'
2432 def report_extraction(self, showName):
2433 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2435 def report_config_download(self, showName):
2436 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2438 def _real_extract(self, url):
2439 mobj = re.match(self._VALID_URL, url)
2441 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2443 showName = mobj.group('showname')
2444 videoId = mobj.group('episode')
2446 self.report_extraction(showName)
2448 webPage = urllib2.urlopen(url)
2449 webPageBytes = webPage.read()
# Decode using the charset from the Content-Type header, else assume utf-8.
2450 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2451 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2453 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): the .group(1) chains below assume every meta tag is present;
# a missing tag would raise AttributeError rather than a clean error.
2456 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2457 description = unescapeHTML(descMatch.group(1))
2458 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2459 imgUrl = unescapeHTML(imgMatch.group(1))
2460 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2461 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries the config location in its 'config=' query argument.
2462 configUrlMatch = re.search('config=(.*)$', playerUrl)
2463 configUrl = urllib2.unquote(configUrlMatch.group(1))
2465 self.report_config_download(showName)
2467 configJSON = urllib2.urlopen(configUrl).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2472 # Technically, it's JavaScript, not JSON
2473 configJSON = configJSON.replace("'", '"')
2476 config = json.loads(configJSON)
2477 except (ValueError,), err:
2478 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2481 playlist = config['playlist']
# NOTE(review): index 1 presumably skips a leading non-video playlist entry — confirm.
2482 videoUrl = playlist[1]['url']
2487 'uploader': showName,
2488 'upload_date': None,
2491 'thumbnail': imgUrl,
2492 'description': description,
2493 'player_url': playerUrl,
2499 class CollegeHumorIE(InfoExtractor):
2500 """Information extractor for collegehumor.com"""
2502 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2503 IE_NAME = u'collegehumor'
2505 def report_webpage(self, video_id):
2506 """Report webpage download."""
2507 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2509 def report_extraction(self, video_id):
2510 """Report information extraction."""
2511 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2513 def _real_extract(self, url):
2514 mobj = re.match(self._VALID_URL, url)
2516 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2518 video_id = mobj.group('videoid')
2520 self.report_webpage(video_id)
2521 request = urllib2.Request(url)
2523 webpage = urllib2.urlopen(request).read()
2524 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page embeds an internal id ('video:NNN') consumed by the moogaloop XML API.
2528 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2530 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2532 internal_video_id = m.group('internalvideoid')
2536 'internal_id': internal_video_id,
2539 self.report_extraction(video_id)
2540 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2542 metaXml = urllib2.urlopen(xmlUrl).read()
2543 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Pull title/description/media URL/thumbnail out of the <video> node.
2547 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2549 videoNode = mdoc.findall('./video')[0]
2550 info['description'] = videoNode.findall('./description')[0].text
2551 info['title'] = videoNode.findall('./caption')[0].text
2552 info['url'] = videoNode.findall('./file')[0].text
2553 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
2554 info['ext'] = info['url'].rpartition('.')[2]
2556 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2562 class XVideosIE(InfoExtractor):
2563 """Information extractor for xvideos.com"""
2565 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2566 IE_NAME = u'xvideos'
2568 def report_webpage(self, video_id):
2569 """Report webpage download."""
2570 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2572 def report_extraction(self, video_id):
2573 """Report information extraction."""
2574 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2576 def _real_extract(self, url):
2577 mobj = re.match(self._VALID_URL, url)
2579 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2581 video_id = mobj.group(1).decode('utf-8')
2583 self.report_webpage(video_id)
2585 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2587 webpage = urllib2.urlopen(request).read()
2588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2592 self.report_extraction(video_id)
# The media URL is percent-encoded in the page's 'flv_url' parameter.
2596 mobj = re.search(r'flv_url=(.+?)&', webpage)
2598 self._downloader.trouble(u'ERROR: unable to extract video url')
2600 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text up to the ' - XVID' suffix.
2604 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2606 self._downloader.trouble(u'ERROR: unable to extract video title')
2608 video_title = mobj.group(1).decode('utf-8')
2611 # Extract video thumbnail
# group(0) is intentional: the thumbnail is the whole matched URL.
2612 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2614 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2616 video_thumbnail = mobj.group(0).decode('utf-8')
2622 'upload_date': None,
2623 'title': video_title,
2625 'thumbnail': video_thumbnail,
2626 'description': None,
2632 class SoundcloudIE(InfoExtractor):
2633 """Information extractor for soundcloud.com
2634 To access the media, the uid of the song and a stream token
2635 must be extracted from the page source and the script must make
2636 a request to media.soundcloud.com/crossdomain.xml. Then
2637 the media can be grabbed by requesting from an url composed
2638 of the stream token and uid
# URL groups: (1) uploader slug, (2) track slug.
2641 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2642 IE_NAME = u'soundcloud'
2644 def __init__(self, downloader=None):
2645 InfoExtractor.__init__(self, downloader)
2647 def report_webpage(self, video_id):
2648 """Report webpage download."""
2649 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2651 def report_extraction(self, video_id):
2652 """Report information extraction."""
2653 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2655 def _real_extract(self, url):
2656 mobj = re.match(self._VALID_URL, url)
2658 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2661 # extract uploader (which is in the url)
2662 uploader = mobj.group(1).decode('utf-8')
2663 # extract simple title (uploader + slug of song title)
2664 slug_title = mobj.group(2).decode('utf-8')
2665 simple_title = uploader + u'-' + slug_title
2667 self.report_webpage('%s/%s' % (uploader, slug_title))
2669 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2671 webpage = urllib2.urlopen(request).read()
2672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2673 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2676 self.report_extraction('%s/%s' % (uploader, slug_title))
2678 # extract uid and stream token that soundcloud hands out for access
2679 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2681 video_id = mobj.group(1)
2682 stream_token = mobj.group(2)
2684 # extract unsimplified title
2685 mobj = re.search('"title":"(.*?)",', webpage)
2687 title = mobj.group(1).decode('utf-8')
# Fall back to the slug-derived title when the page offers none.
2689 title = simple_title
2691 # construct media url (with uid/token)
2692 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2693 mediaURL = mediaURL % (video_id, stream_token)
# Description defaults to a placeholder when the page has none.
2696 description = u'No description available'
2697 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2699 description = mobj.group(1)
# Dates render like 'November 6, 2010 14:30'; normalise to YYYYMMDD, best effort.
2703 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2706 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2707 except Exception, e:
2708 self._downloader.to_stderr(compat_str(e))
2710 # for soundcloud, a request to a cross domain is required for cookies
2711 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2714 'id': video_id.decode('utf-8'),
2716 'uploader': uploader.decode('utf-8'),
2717 'upload_date': upload_date,
2720 'description': description.decode('utf-8')
2724 class InfoQIE(InfoExtractor):
2725 """Information extractor for infoq.com"""
2727 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2730 def report_webpage(self, video_id):
2731 """Report webpage download."""
2732 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2734 def report_extraction(self, video_id):
2735 """Report information extraction."""
2736 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2738 def _real_extract(self, url):
2739 mobj = re.match(self._VALID_URL, url)
2741 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2744 self.report_webpage(url)
2746 request = urllib2.Request(url)
2748 webpage = urllib2.urlopen(request).read()
2749 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2750 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2753 self.report_extraction(url)
2757 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2759 self._downloader.trouble(u'ERROR: unable to extract video url')
# 'jsclassref' holds a base64-encoded, URL-quoted path that is appended to the RTMP base.
2761 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2765 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2767 self._downloader.trouble(u'ERROR: unable to extract video title')
2769 video_title = mobj.group(1).decode('utf-8')
2771 # Extract description
2772 video_description = u'No description available.'
2773 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2774 if mobj is not None:
2775 video_description = mobj.group(1).decode('utf-8')
# The media file name doubles as the video id; split off its extension.
2777 video_filename = video_url.split('/')[-1]
2778 video_id, extension = video_filename.split('.')
2784 'upload_date': None,
2785 'title': video_title,
2786 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2788 'description': video_description,
2793 class MixcloudIE(InfoExtractor):
2794 """Information extractor for www.mixcloud.com"""
2795 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796 IE_NAME = u'mixcloud'
2798 def __init__(self, downloader=None):
2799 InfoExtractor.__init__(self, downloader)
2801 def report_download_json(self, file_id):
2802 """Report JSON download."""
2803 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2805 def report_extraction(self, file_id):
2806 """Report information extraction."""
2807 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for a format, picking the highest bitrate by default;
# formats without bitrate sub-dicts are handled via the TypeError fallback.
2809 def get_urls(self, jsonData, fmt, bitrate='best'):
2810 """Get urls from 'audio_formats' section in json"""
2813 bitrate_list = jsonData[fmt]
2814 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2815 bitrate = max(bitrate_list) # select highest
2817 url_list = jsonData[fmt][bitrate]
2818 except TypeError: # we have no bitrate info.
2819 url_list = jsonData[fmt]
# Probes each candidate URL over the network, returning the first that answers.
2822 def check_urls(self, url_list):
2823 """Returns 1st active url from list"""
2824 for url in url_list:
2826 urllib2.urlopen(url)
2828 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Print every format/bitrate pair for --list-formats.
2833 def _print_formats(self, formats):
2834 print('Available formats:')
2835 for fmt in formats.keys():
2836 for b in formats[fmt]:
2838 ext = formats[fmt][b][0]
2839 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2840 except TypeError: # we have no bitrate info
2841 ext = formats[fmt][0]
2842 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2845 def _real_extract(self, url):
2846 mobj = re.match(self._VALID_URL, url)
2848 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2850 # extract uploader & filename from url
2851 uploader = mobj.group(1).decode('utf-8')
2852 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2854 # construct API request
2855 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2856 # retrieve .json file with links to files
2857 request = urllib2.Request(file_url)
2859 self.report_download_json(file_url)
2860 jsonData = urllib2.urlopen(request).read()
2861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2862 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2866 json_data = json.loads(jsonData)
2867 player_url = json_data['player_swf_url']
2868 formats = dict(json_data['audio_formats'])
2870 req_format = self._downloader.params.get('format', None)
2873 if self._downloader.params.get('listformats', None):
2874 self._print_formats(formats)
# No explicit format requested: take the first format whose URL actually responds.
2877 if req_format is None or req_format == 'best':
2878 for format_param in formats.keys():
2879 url_list = self.get_urls(formats, format_param)
2881 file_url = self.check_urls(url_list)
2882 if file_url is not None:
2885 if req_format not in formats.keys():
2886 self._downloader.trouble(u'ERROR: format is not available')
2889 url_list = self.get_urls(formats, req_format)
2890 file_url = self.check_urls(url_list)
2891 format_param = req_format
2894 'id': file_id.decode('utf-8'),
2895 'url': file_url.decode('utf-8'),
2896 'uploader': uploader.decode('utf-8'),
2897 'upload_date': u'NA',
2898 'title': json_data['name'],
2899 'ext': file_url.split('.')[-1].decode('utf-8'),
2900 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2901 'thumbnail': json_data['thumbnail_url'],
2902 'description': json_data['description'],
2903 'player_url': player_url.decode('utf-8'),
2906 class StanfordOpenClassroomIE(InfoExtractor):
2907 """Information extractor for Stanford's Open ClassRoom"""
# Matches the site root, a CoursePage, or a VideoPage; <course>/<video> groups
# decide which of the three extraction branches below runs.
2909 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2910 IE_NAME = u'stanfordoc'
2912 def report_download_webpage(self, objid):
2913 """Report webpage download."""
2914 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2916 def report_extraction(self, video_id):
2917 """Report information extraction."""
2918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2920 def _real_extract(self, url):
2921 mobj = re.match(self._VALID_URL, url)
2923 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: a single video — fetch its per-video XML descriptor.
2926 if mobj.group('course') and mobj.group('video'): # A specific video
2927 course = mobj.group('course')
2928 video = mobj.group('video')
2930 'id': course + '_' + video,
2933 self.report_extraction(info['id'])
2934 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2935 xmlUrl = baseUrl + video + '.xml'
2937 metaXml = urllib2.urlopen(xmlUrl).read()
2938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2939 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2941 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2943 info['title'] = mdoc.findall('./title')[0].text
2944 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2946 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2948 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: a course page — collect its VideoPage links and recurse via self.extract.
2950 elif mobj.group('course'): # A course page
2951 course = mobj.group('course')
2957 self.report_download_webpage(info['id'])
2959 coursepage = urllib2.urlopen(url).read()
2960 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2961 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2964 m = re.search('<h1>([^<]+)</h1>', coursepage)
2966 info['title'] = unescapeHTML(m.group(1))
2968 info['title'] = info['id']
2970 m = re.search('<description>([^<]+)</description>', coursepage)
2972 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps the page order while dropping duplicate links.
2974 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2977 'type': 'reference',
2978 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2982 for entry in info['list']:
2983 assert entry['type'] == 'reference'
2984 results += self.extract(entry['url'])
# Branch 3: the site root — collect every CoursePage link and recurse likewise.
2989 'id': 'Stanford OpenClassroom',
2993 self.report_download_webpage(info['id'])
2994 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2996 rootpage = urllib2.urlopen(rootURL).read()
2997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2998 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3001 info['title'] = info['id']
3003 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3006 'type': 'reference',
3007 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3012 for entry in info['list']:
3013 assert entry['type'] == 'reference'
3014 results += self.extract(entry['url'])
3017 class MTVIE(InfoExtractor):
3018 """Information extractor for MTV.com"""
3020 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3023 def report_webpage(self, video_id):
3024 """Report information extraction."""
3025 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3027 def report_extraction(self, video_id):
3028 """Report information extraction."""
3029 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3031 def _real_extract(self, url):
3032 mobj = re.match(self._VALID_URL, url)
3034 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3036 if not mobj.group('proto'):
3037 url = 'http://' + url
3038 video_id = mobj.group('videoid')
3039 self.report_webpage(video_id)
3041 request = urllib2.Request(url)
3043 webpage = urllib2.urlopen(request).read()
3044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3045 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3048 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3050 self._downloader.trouble(u'ERROR: unable to extract song name')
3052 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3053 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3055 self._downloader.trouble(u'ERROR: unable to extract performer')
3057 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3058 video_title = performer + ' - ' + song_name
3060 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3062 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3064 mtvn_uri = mobj.group(1)
3066 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3068 self._downloader.trouble(u'ERROR: unable to extract content id')
3070 content_id = mobj.group(1)
3072 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3073 self.report_extraction(video_id)
3074 request = urllib2.Request(videogen_url)
3076 metadataXml = urllib2.urlopen(request).read()
3077 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3078 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3081 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3082 renditions = mdoc.findall('.//rendition')
3084 # For now, always pick the highest quality.
3085 rendition = renditions[-1]
3088 _,_,ext = rendition.attrib['type'].partition('/')
3089 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3090 video_url = rendition.find('./src').text
3092 self._downloader.trouble('Invalid rendition field.')
3098 'uploader': performer,
3099 'title': video_title,
3107 class YoukuIE(InfoExtractor):
3109 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3112 def __init__(self, downloader=None):
3113 InfoExtractor.__init__(self, downloader)
3115 def report_download_webpage(self, file_id):
3116 """Report webpage download."""
3117 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3119 def report_extraction(self, file_id):
3120 """Report information extraction."""
3121 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3124 nowTime = int(time.time() * 1000)
3125 random1 = random.randint(1000,1998)
3126 random2 = random.randint(1000,9999)
3128 return "%d%d%d" %(nowTime,random1,random2)
3130 def _get_file_ID_mix_string(self, seed):
3132 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3134 for i in range(len(source)):
3135 seed = (seed * 211 + 30031 ) % 65536
3136 index = math.floor(seed / 65536 * len(source) )
3137 mixed.append(source[int(index)])
3138 source.remove(source[int(index)])
3139 #return ''.join(mixed)
3142 def _get_file_id(self, fileId, seed):
3143 mixed = self._get_file_ID_mix_string(seed)
3144 ids = fileId.split('*')
3148 realId.append(mixed[int(ch)])
3149 return ''.join(realId)
3151 def _real_extract(self, url):
3152 mobj = re.match(self._VALID_URL, url)
3154 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3156 video_id = mobj.group('ID')
3158 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3160 request = urllib2.Request(info_url, None, std_headers)
3162 self.report_download_webpage(video_id)
3163 jsondata = urllib2.urlopen(request).read()
3164 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3165 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3168 self.report_extraction(video_id)
3170 config = json.loads(jsondata)
3172 video_title = config['data'][0]['title']
3173 seed = config['data'][0]['seed']
3175 format = self._downloader.params.get('format', None)
3176 supported_format = config['data'][0]['streamfileids'].keys()
3178 if format is None or format == 'best':
3179 if 'hd2' in supported_format:
3184 elif format == 'worst':
3192 fileid = config['data'][0]['streamfileids'][format]
3193 seg_number = len(config['data'][0]['segs'][format])
3196 for i in xrange(seg_number):
3197 keys.append(config['data'][0]['segs'][format][i]['k'])
3200 #youku only could be viewed from mainland china
3202 self._downloader.trouble(u'ERROR: unable to extract info section')
3206 sid = self._gen_sid()
3207 fileid = self._get_file_id(fileid, seed)
3209 #column 8,9 of fileid represent the segment number
3210 #fileid[7:9] should be changed
3211 for index, key in enumerate(keys):
3213 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3214 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3217 'id': '%s_part%02d' % (video_id, index),
3218 'url': download_url,
3220 'title': video_title,
3223 files_info.append(info)
3228 class XNXXIE(InfoExtractor):
3229 """Information extractor for xnxx.com"""
3231 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3233 VIDEO_URL_RE = r'flv_url=(.*?)&'
3234 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3235 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3237 def report_webpage(self, video_id):
3238 """Report information extraction"""
3239 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3241 def report_extraction(self, video_id):
3242 """Report information extraction"""
3243 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3245 def _real_extract(self, url):
3246 mobj = re.match(self._VALID_URL, url)
3248 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3250 video_id = mobj.group(1).decode('utf-8')
3252 self.report_webpage(video_id)
3254 # Get webpage content
3256 webpage = urllib2.urlopen(url).read()
3257 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3258 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3261 result = re.search(self.VIDEO_URL_RE, webpage)
3263 self._downloader.trouble(u'ERROR: unable to extract video url')
3265 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3267 result = re.search(self.VIDEO_TITLE_RE, webpage)
3269 self._downloader.trouble(u'ERROR: unable to extract video title')
3271 video_title = result.group(1).decode('utf-8')
3273 result = re.search(self.VIDEO_THUMB_RE, webpage)
3275 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3277 video_thumbnail = result.group(1).decode('utf-8')
3283 'upload_date': None,
3284 'title': video_title,
3286 'thumbnail': video_thumbnail,
3287 'description': None,
3291 class GooglePlusIE(InfoExtractor):
3292 """Information extractor for plus.google.com."""
3294 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3295 IE_NAME = u'plus.google'
3297 def __init__(self, downloader=None):
3298 InfoExtractor.__init__(self, downloader)
3300 def report_extract_entry(self, url):
3301 """Report downloading extry"""
3302 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3304 def report_date(self, upload_date):
3305 """Report downloading extry"""
3306 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3308 def report_uploader(self, uploader):
3309 """Report downloading extry"""
3310 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3312 def report_title(self, video_title):
3313 """Report downloading extry"""
3314 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3316 def report_extract_vid_page(self, video_page):
3317 """Report information extraction."""
3318 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3320 def _real_extract(self, url):
3321 # Extract id from URL
3322 mobj = re.match(self._VALID_URL, url)
3324 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3327 post_url = mobj.group(0)
3328 video_id = mobj.group(2)
3330 video_extension = 'flv'
3332 # Step 1, Retrieve post webpage to extract further information
3333 self.report_extract_entry(post_url)
3334 request = urllib2.Request(post_url)
3336 webpage = urllib2.urlopen(request).read()
3337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3341 # Extract update date
3343 pattern = 'title="Timestamp">(.*?)</a>'
3344 mobj = re.search(pattern, webpage)
3346 upload_date = mobj.group(1)
3347 # Convert timestring to a format suitable for filename
3348 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3349 upload_date = upload_date.strftime('%Y%m%d')
3350 self.report_date(upload_date)
3354 pattern = r'rel\="author".*?>(.*?)</a>'
3355 mobj = re.search(pattern, webpage)
3357 uploader = mobj.group(1)
3358 self.report_uploader(uploader)
3361 # Get the first line for title
3363 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3364 mobj = re.search(pattern, webpage)
3366 video_title = mobj.group(1)
3367 self.report_title(video_title)
3369 # Step 2, Stimulate clicking the image box to launch video
3370 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3371 mobj = re.search(pattern, webpage)
3373 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3375 video_page = mobj.group(1)
3376 request = urllib2.Request(video_page)
3378 webpage = urllib2.urlopen(request).read()
3379 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3380 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3382 self.report_extract_vid_page(video_page)
3385 # Extract video links on video page
3386 """Extract video links of all sizes"""
3387 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3388 mobj = re.findall(pattern, webpage)
3390 self._downloader.trouble(u'ERROR: unable to extract video links')
3392 # Sort in resolution
3393 links = sorted(mobj)
3395 # Choose the lowest of the sort, i.e. highest resolution
3396 video_url = links[-1]
3397 # Only get the url. The resolution part in the tuple has no use anymore
3398 video_url = video_url[-1]
3399 # Treat escaped \u0026 style hex
3400 video_url = unicode(video_url, "unicode_escape")
3404 'id': video_id.decode('utf-8'),
3406 'uploader': uploader.decode('utf-8'),
3407 'upload_date': upload_date.decode('utf-8'),
3408 'title': video_title.decode('utf-8'),
3409 'ext': video_extension.decode('utf-8'),