2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information from the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title and simplified
34 title, author and others. The information is stored in a dictionary
35 which is then passed to the FileDownloader. The FileDownloader
36 processes this information possibly downloading the video to the file
37 system, among other possible outcomes. The dictionaries must include
42 uploader: Nickname of the video uploader.
44 ext: Video filename extension.
46 player_url: SWF Player URL (may be None).
48 The following fields are optional. Their primary purpose is to allow
49 youtube-dl to serve as the backend for a video search function, such
50 as the one in youtube2mp3. They are only used when their respective
51 forced printing functions are called:
53 thumbnail: Full URL to a video thumbnail image.
54 description: One-line video description.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
64 def __init__(self, downloader=None):
65 """Constructor. Receives an optional downloader."""
67 self.set_downloader(downloader)
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
79 def extract(self, url):
80 """Extracts URL information and returns it in list of dicts."""
82 return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
88 def _real_initialize(self):
89 """Real initialization process. Redefine in subclasses."""
92 def _real_extract(self, url):
93 """Real extraction process. Redefine in subclasses."""
97 class YoutubeIE(InfoExtractor):
98 """Information extractor for youtube.com."""
102 (?:https?://)? # http(s):// (optional)
103 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
105 (?:.*?\#/)? # handle anchor (#/) redirect urls
106 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
107 (?: # the various things that can precede the ID:
108 (?:(?:v|embed|e)/) # v/ or embed/ or e/
109 |(?: # or the v= param in all its forms
110 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
111 (?:\?|\#!?) # the params delimiter ? or # or #!
112 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
115 )? # optional -> youtube.com/xxxx is OK
116 )? # all until now is optional -> you can pass the naked ID
117 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
118 (?(1).+)? # if we found the ID, everything can follow
120 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
121 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
122 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
123 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
124 _NETRC_MACHINE = 'youtube'
125 # Listed in order of quality
126 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
127 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
128 _video_extensions = {
134 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
140 _video_dimensions = {
158 def suitable(self, url):
159 """Receives a URL and returns True if suitable for this IE."""
160 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
162 def report_lang(self):
163 """Report attempt to set language."""
164 self._downloader.to_screen(u'[youtube] Setting language')
166 def report_login(self):
167 """Report attempt to log in."""
168 self._downloader.to_screen(u'[youtube] Logging in')
170 def report_age_confirmation(self):
171 """Report attempt to confirm age."""
172 self._downloader.to_screen(u'[youtube] Confirming age')
174 def report_video_webpage_download(self, video_id):
175 """Report attempt to download video webpage."""
176 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
178 def report_video_info_webpage_download(self, video_id):
179 """Report attempt to download video info webpage."""
180 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
182 def report_video_subtitles_download(self, video_id):
183 """Report attempt to download video info webpage."""
184 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
186 def report_information_extraction(self, video_id):
187 """Report attempt to extract video information."""
188 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
190 def report_unavailable_format(self, video_id, format):
191 """Report extracted video URL."""
192 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
194 def report_rtmp_download(self):
195 """Indicate the download will use the RTMP protocol."""
196 self._downloader.to_screen(u'[youtube] RTMP download detected')
198 def _closed_captions_xml_to_srt(self, xml_string):
200 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
201 # TODO parse xml instead of regex
202 for n, (start, dur_tag, dur, caption) in enumerate(texts):
203 if not dur: dur = '4'
205 end = start + float(dur)
206 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
207 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
208 caption = unescapeHTML(caption)
209 caption = unescapeHTML(caption) # double cycle, intentional
210 srt += str(n+1) + '\n'
211 srt += start + ' --> ' + end + '\n'
212 srt += caption + '\n\n'
215 def _print_formats(self, formats):
216 print('Available formats:')
218 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
220 def _real_initialize(self):
221 if self._downloader is None:
226 downloader_params = self._downloader.params
228 # Attempt to use provided username and password or .netrc data
229 if downloader_params.get('username', None) is not None:
230 username = downloader_params['username']
231 password = downloader_params['password']
232 elif downloader_params.get('usenetrc', False):
234 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
239 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
240 except (IOError, netrc.NetrcParseError), err:
241 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
245 request = urllib2.Request(self._LANG_URL)
248 urllib2.urlopen(request).read()
249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
250 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
253 # No authentication to be performed
259 'current_form': 'loginForm',
261 'action_login': 'Log In',
262 'username': username,
263 'password': password,
265 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
268 login_results = urllib2.urlopen(request).read()
269 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
270 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
273 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
279 'action_confirm': 'Confirm',
281 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
283 self.report_age_confirmation()
284 age_results = urllib2.urlopen(request).read()
285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
286 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
289 def _real_extract(self, url):
290 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
291 mobj = re.search(self._NEXT_URL_RE, url)
293 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
295 # Extract video id from URL
296 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
298 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
300 video_id = mobj.group(2)
303 self.report_video_webpage_download(video_id)
304 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
306 video_webpage = urllib2.urlopen(request).read()
307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
311 # Attempt to extract SWF player URL
312 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
314 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
319 self.report_video_info_webpage_download(video_id)
320 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
321 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
322 % (video_id, el_type))
323 request = urllib2.Request(video_info_url)
325 video_info_webpage = urllib2.urlopen(request).read()
326 video_info = parse_qs(video_info_webpage)
327 if 'token' in video_info:
329 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
330 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
332 if 'token' not in video_info:
333 if 'reason' in video_info:
334 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
336 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
339 # Check for "rental" videos
340 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
341 self._downloader.trouble(u'ERROR: "rental" videos not supported')
344 # Start extracting information
345 self.report_information_extraction(video_id)
348 if 'author' not in video_info:
349 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
351 video_uploader = urllib.unquote_plus(video_info['author'][0])
354 if 'title' not in video_info:
355 self._downloader.trouble(u'ERROR: unable to extract video title')
357 video_title = urllib.unquote_plus(video_info['title'][0])
358 video_title = video_title.decode('utf-8')
361 if 'thumbnail_url' not in video_info:
362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
364 else: # don't panic if we can't find it
365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
373 for expression in format_expressions:
375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
380 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
381 if video_description: video_description = clean_html(video_description)
382 else: video_description = ''
385 video_subtitles = None
386 if self._downloader.params.get('writesubtitles', False):
388 self.report_video_subtitles_download(video_id)
389 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
391 srt_list = urllib2.urlopen(request).read()
392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
393 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
394 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
395 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
396 if not srt_lang_list:
397 raise Trouble(u'WARNING: video has no closed captions')
398 if self._downloader.params.get('subtitleslang', False):
399 srt_lang = self._downloader.params.get('subtitleslang')
400 elif 'en' in srt_lang_list:
403 srt_lang = srt_lang_list.keys()[0]
404 if not srt_lang in srt_lang_list:
405 raise Trouble(u'WARNING: no closed captions found in the specified language')
406 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
408 srt_xml = urllib2.urlopen(request).read()
409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
410 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
412 raise Trouble(u'WARNING: unable to download video subtitles')
413 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
414 except Trouble as trouble:
415 self._downloader.trouble(trouble[0])
417 if 'length_seconds' not in video_info:
418 self._downloader.trouble(u'WARNING: unable to extract video duration')
421 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
424 video_token = urllib.unquote_plus(video_info['token'][0])
426 # Decide which formats to download
427 req_format = self._downloader.params.get('format', None)
429 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
430 self.report_rtmp_download()
431 video_url_list = [(None, video_info['conn'][0])]
432 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
433 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
434 url_data = [parse_qs(uds) for uds in url_data_strs]
435 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
436 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
438 format_limit = self._downloader.params.get('format_limit', None)
439 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
440 if format_limit is not None and format_limit in available_formats:
441 format_list = available_formats[available_formats.index(format_limit):]
443 format_list = available_formats
444 existing_formats = [x for x in format_list if x in url_map]
445 if len(existing_formats) == 0:
446 self._downloader.trouble(u'ERROR: no known formats available for video')
448 if self._downloader.params.get('listformats', None):
449 self._print_formats(existing_formats)
451 if req_format is None or req_format == 'best':
452 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
453 elif req_format == 'worst':
454 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
455 elif req_format in ('-1', 'all'):
456 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
458 # Specific formats. We pick the first in a slash-delimited sequence.
459 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
460 req_formats = req_format.split('/')
461 video_url_list = None
462 for rf in req_formats:
464 video_url_list = [(rf, url_map[rf])]
466 if video_url_list is None:
467 self._downloader.trouble(u'ERROR: requested format not available')
470 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
474 for format_param, video_real_url in video_url_list:
476 video_extension = self._video_extensions.get(format_param, 'flv')
479 'id': video_id.decode('utf-8'),
480 'url': video_real_url.decode('utf-8'),
481 'uploader': video_uploader.decode('utf-8'),
482 'upload_date': upload_date,
483 'title': video_title,
484 'ext': video_extension.decode('utf-8'),
485 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
486 'thumbnail': video_thumbnail.decode('utf-8'),
487 'description': video_description,
488 'player_url': player_url,
489 'subtitles': video_subtitles,
490 'duration': video_duration
495 class MetacafeIE(InfoExtractor):
496 """Information Extractor for metacafe.com."""
498 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
499 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
500 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
501 IE_NAME = u'metacafe'
503 def __init__(self, downloader=None):
504 InfoExtractor.__init__(self, downloader)
506 def report_disclaimer(self):
507 """Report disclaimer retrieval."""
508 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
510 def report_age_confirmation(self):
511 """Report attempt to confirm age."""
512 self._downloader.to_screen(u'[metacafe] Confirming age')
514 def report_download_webpage(self, video_id):
515 """Report webpage download."""
516 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
518 def report_extraction(self, video_id):
519 """Report information extraction."""
520 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
522 def _real_initialize(self):
523 # Retrieve disclaimer
524 request = urllib2.Request(self._DISCLAIMER)
526 self.report_disclaimer()
527 disclaimer = urllib2.urlopen(request).read()
528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
529 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
535 'submit': "Continue - I'm over 18",
537 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
539 self.report_age_confirmation()
540 disclaimer = urllib2.urlopen(request).read()
541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
542 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
545 def _real_extract(self, url):
546 # Extract id and simplified title from URL
547 mobj = re.match(self._VALID_URL, url)
549 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
552 video_id = mobj.group(1)
554 # Check if video comes from YouTube
555 mobj2 = re.match(r'^yt-(.*)$', video_id)
556 if mobj2 is not None:
557 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
560 # Retrieve video webpage to extract further information
561 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
563 self.report_download_webpage(video_id)
564 webpage = urllib2.urlopen(request).read()
565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
566 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
569 # Extract URL, uploader and title from webpage
570 self.report_extraction(video_id)
571 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
573 mediaURL = urllib.unquote(mobj.group(1))
574 video_extension = mediaURL[-3:]
576 # Extract gdaKey if available
577 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
581 gdaKey = mobj.group(1)
582 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
584 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
586 self._downloader.trouble(u'ERROR: unable to extract media URL')
588 vardict = parse_qs(mobj.group(1))
589 if 'mediaData' not in vardict:
590 self._downloader.trouble(u'ERROR: unable to extract media URL')
592 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
594 self._downloader.trouble(u'ERROR: unable to extract media URL')
596 mediaURL = mobj.group(1).replace('\\/', '/')
597 video_extension = mediaURL[-3:]
598 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
600 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
602 self._downloader.trouble(u'ERROR: unable to extract title')
604 video_title = mobj.group(1).decode('utf-8')
606 mobj = re.search(r'submitter=(.*?);', webpage)
608 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
610 video_uploader = mobj.group(1)
613 'id': video_id.decode('utf-8'),
614 'url': video_url.decode('utf-8'),
615 'uploader': video_uploader.decode('utf-8'),
616 'upload_date': u'NA',
617 'title': video_title,
618 'ext': video_extension.decode('utf-8'),
624 class DailymotionIE(InfoExtractor):
625 """Information Extractor for Dailymotion"""
627 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
628 IE_NAME = u'dailymotion'
630 def __init__(self, downloader=None):
631 InfoExtractor.__init__(self, downloader)
633 def report_download_webpage(self, video_id):
634 """Report webpage download."""
635 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
637 def report_extraction(self, video_id):
638 """Report information extraction."""
639 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
641 def _real_extract(self, url):
642 # Extract id and simplified title from URL
643 mobj = re.match(self._VALID_URL, url)
645 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
648 video_id = mobj.group(1).split('_')[0].split('?')[0]
650 video_extension = 'mp4'
652 # Retrieve video webpage to extract further information
653 request = urllib2.Request(url)
654 request.add_header('Cookie', 'family_filter=off')
656 self.report_download_webpage(video_id)
657 webpage = urllib2.urlopen(request).read()
658 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
659 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
662 # Extract URL, uploader and title from webpage
663 self.report_extraction(video_id)
664 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
666 self._downloader.trouble(u'ERROR: unable to extract media URL')
668 flashvars = urllib.unquote(mobj.group(1))
670 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
673 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
676 self._downloader.trouble(u'ERROR: unable to extract video URL')
679 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
681 self._downloader.trouble(u'ERROR: unable to extract video URL')
684 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
686 # TODO: support choosing qualities
688 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
690 self._downloader.trouble(u'ERROR: unable to extract title')
692 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
694 video_uploader = u'NA'
695 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
697 # looking for the official user
698 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
699 if mobj_official is None:
700 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
702 video_uploader = mobj_official.group(1)
704 video_uploader = mobj.group(1)
706 video_upload_date = u'NA'
707 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
709 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
712 'id': video_id.decode('utf-8'),
713 'url': video_url.decode('utf-8'),
714 'uploader': video_uploader.decode('utf-8'),
715 'upload_date': video_upload_date,
716 'title': video_title,
717 'ext': video_extension.decode('utf-8'),
723 class GoogleIE(InfoExtractor):
724 """Information extractor for video.google.com."""
726 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
727 IE_NAME = u'video.google'
729 def __init__(self, downloader=None):
730 InfoExtractor.__init__(self, downloader)
732 def report_download_webpage(self, video_id):
733 """Report webpage download."""
734 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
736 def report_extraction(self, video_id):
737 """Report information extraction."""
738 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
740 def _real_extract(self, url):
741 # Extract id from URL
742 mobj = re.match(self._VALID_URL, url)
744 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
747 video_id = mobj.group(1)
749 video_extension = 'mp4'
751 # Retrieve video webpage to extract further information
752 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
754 self.report_download_webpage(video_id)
755 webpage = urllib2.urlopen(request).read()
756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
760 # Extract URL, uploader, and title from webpage
761 self.report_extraction(video_id)
762 mobj = re.search(r"download_url:'([^']+)'", webpage)
764 video_extension = 'flv'
765 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
767 self._downloader.trouble(u'ERROR: unable to extract media URL')
769 mediaURL = urllib.unquote(mobj.group(1))
770 mediaURL = mediaURL.replace('\\x3d', '\x3d')
771 mediaURL = mediaURL.replace('\\x26', '\x26')
775 mobj = re.search(r'<title>(.*)</title>', webpage)
777 self._downloader.trouble(u'ERROR: unable to extract title')
779 video_title = mobj.group(1).decode('utf-8')
781 # Extract video description
782 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
784 self._downloader.trouble(u'ERROR: unable to extract video description')
786 video_description = mobj.group(1).decode('utf-8')
787 if not video_description:
788 video_description = 'No description available.'
790 # Extract video thumbnail
791 if self._downloader.params.get('forcethumbnail', False):
792 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
794 webpage = urllib2.urlopen(request).read()
795 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
796 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
798 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
800 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
802 video_thumbnail = mobj.group(1)
803 else: # we need something to pass to process_info
807 'id': video_id.decode('utf-8'),
808 'url': video_url.decode('utf-8'),
810 'upload_date': u'NA',
811 'title': video_title,
812 'ext': video_extension.decode('utf-8'),
818 class PhotobucketIE(InfoExtractor):
819 """Information extractor for photobucket.com."""
821 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
822 IE_NAME = u'photobucket'
824 def __init__(self, downloader=None):
825 InfoExtractor.__init__(self, downloader)
827 def report_download_webpage(self, video_id):
828 """Report webpage download."""
829 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
831 def report_extraction(self, video_id):
832 """Report information extraction."""
833 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
835 def _real_extract(self, url):
836 # Extract id from URL
837 mobj = re.match(self._VALID_URL, url)
839 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
842 video_id = mobj.group(1)
844 video_extension = 'flv'
846 # Retrieve video webpage to extract further information
847 request = urllib2.Request(url)
849 self.report_download_webpage(video_id)
850 webpage = urllib2.urlopen(request).read()
851 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
852 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
855 # Extract URL, uploader, and title from webpage
856 self.report_extraction(video_id)
857 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
859 self._downloader.trouble(u'ERROR: unable to extract media URL')
861 mediaURL = urllib.unquote(mobj.group(1))
865 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
867 self._downloader.trouble(u'ERROR: unable to extract title')
869 video_title = mobj.group(1).decode('utf-8')
871 video_uploader = mobj.group(2).decode('utf-8')
874 'id': video_id.decode('utf-8'),
875 'url': video_url.decode('utf-8'),
876 'uploader': video_uploader,
877 'upload_date': u'NA',
878 'title': video_title,
879 'ext': video_extension.decode('utf-8'),
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912 video_id = mobj.group(2)
913 video_extension = 'flv'
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
929 yahoo_id = mobj.group(1)
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
935 yahoo_vid = mobj.group(1)
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
953 self._downloader.trouble(u'ERROR: unable to extract video title')
955 video_title = mobj.group(1).decode('utf-8')
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
961 video_uploader = mobj.group(1).decode('utf-8')
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
968 video_thumbnail = mobj.group(1).decode('utf-8')
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
973 self._downloader.trouble(u'ERROR: unable to extract video description')
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
982 self._downloader.trouble(u'ERROR: unable to extract video height')
984 yv_video_height = mobj.group(1)
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
988 self._downloader.trouble(u'ERROR: unable to extract video width')
990 yv_video_width = mobj.group(1)
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
1016 'id': video_id.decode('utf-8'),
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1024 'thumbnail': video_thumbnail,
1029 class VimeoIE(InfoExtractor):
1030 """Information extractor for vimeo.com."""
1032 # _VALID_URL matches Vimeo URLs
1033 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1036 def __init__(self, downloader=None):
1037 InfoExtractor.__init__(self, downloader)
1039 def report_download_webpage(self, video_id):
1040 """Report webpage download."""
1041 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1043 def report_extraction(self, video_id):
1044 """Report information extraction."""
1045 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1047 def _real_extract(self, url, new_video=True):
1048 # Extract ID from URL
1049 mobj = re.match(self._VALID_URL, url)
1051 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1054 video_id = mobj.group(1)
1056 # Retrieve video webpage to extract further information
1057 request = urllib2.Request(url, None, std_headers)
1059 self.report_download_webpage(video_id)
1060 webpage = urllib2.urlopen(request).read()
1061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and latter we extract those that are Vimeo specific.
1068 self.report_extraction(video_id)
1070 # Extract the config JSON
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1073 config = json.loads(config)
1075 self._downloader.trouble(u'ERROR: unable to extract info section')
1079 video_title = config["video"]["title"]
1082 video_uploader = config["video"]["owner"]["name"]
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1087 # Extract video description
1088 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089 if video_description: video_description = clean_html(video_description)
1090 else: video_description = ''
1092 # Extract upload date
1093 video_upload_date = u'NA'
1094 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1)
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1102 # Vimeo specific: extract video codec and quality information
1103 # First consider quality, then codecs, then take everything
1104 # TODO bind to format param
1105 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106 files = { 'hd': [], 'sd': [], 'other': []}
1107 for codec_name, codec_extension in codecs:
1108 if codec_name in config["video"]["files"]:
1109 if 'hd' in config["video"]["files"][codec_name]:
1110 files['hd'].append((codec_name, codec_extension, 'hd'))
1111 elif 'sd' in config["video"]["files"][codec_name]:
1112 files['sd'].append((codec_name, codec_extension, 'sd'))
1114 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1116 for quality in ('hd', 'sd', 'other'):
1117 if len(files[quality]) > 0:
1118 video_quality = files[quality][0][2]
1119 video_codec = files[quality][0][0]
1120 video_extension = files[quality][0][1]
1121 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1124 self._downloader.trouble(u'ERROR: no known codec found')
1127 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1133 'uploader': video_uploader,
1134 'upload_date': video_upload_date,
1135 'title': video_title,
1136 'ext': video_extension,
1137 'thumbnail': video_thumbnail,
1138 'description': video_description,
1143 class GenericIE(InfoExtractor):
1144 """Generic last-resort information extractor."""
1147 IE_NAME = u'generic'
1149 def __init__(self, downloader=None):
1150 InfoExtractor.__init__(self, downloader)
1152 def report_download_webpage(self, video_id):
1153 """Report webpage download."""
1154 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1155 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1157 def report_extraction(self, video_id):
1158 """Report information extraction."""
1159 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1161 def report_following_redirect(self, new_url):
1162 """Report information extraction."""
1163 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1165 def _test_redirect(self, url):
1166 """Check if it is a redirect, like url shorteners, in case restart chain."""
1167 class HeadRequest(urllib2.Request):
1168 def get_method(self):
1171 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1173 Subclass the HTTPRedirectHandler to make it use our
1174 HeadRequest also on the redirected URL
1176 def redirect_request(self, req, fp, code, msg, headers, newurl):
1177 if code in (301, 302, 303, 307):
1178 newurl = newurl.replace(' ', '%20')
1179 newheaders = dict((k,v) for k,v in req.headers.items()
1180 if k.lower() not in ("content-length", "content-type"))
1181 return HeadRequest(newurl,
1183 origin_req_host=req.get_origin_req_host(),
1186 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1188 class HTTPMethodFallback(urllib2.BaseHandler):
1190 Fallback to GET if HEAD is not allowed (405 HTTP error)
1192 def http_error_405(self, req, fp, code, msg, headers):
1196 newheaders = dict((k,v) for k,v in req.headers.items()
1197 if k.lower() not in ("content-length", "content-type"))
1198 return self.parent.open(urllib2.Request(req.get_full_url(),
1200 origin_req_host=req.get_origin_req_host(),
1204 opener = urllib2.OpenerDirector()
1205 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1206 HTTPMethodFallback, HEADRedirectHandler,
1207 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1208 opener.add_handler(handler())
1210 response = opener.open(HeadRequest(url))
1211 new_url = response.geturl()
1213 if url == new_url: return False
1215 self.report_following_redirect(new_url)
1216 self._downloader.download([new_url])
1219 def _real_extract(self, url):
1220 if self._test_redirect(url): return
1222 video_id = url.split('/')[-1]
1223 request = urllib2.Request(url)
1225 self.report_download_webpage(video_id)
1226 webpage = urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1230 except ValueError, err:
1231 # since this is the last-resort InfoExtractor, if
1232 # this error is thrown, it'll be thrown here
1233 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1236 self.report_extraction(video_id)
1237 # Start with something easy: JW Player in SWFObject
1238 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1240 # Broaden the search a little bit
1241 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1243 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1246 # It's possible that one of the regexes
1247 # matched, but returned an empty group:
1248 if mobj.group(1) is None:
1249 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1252 video_url = urllib.unquote(mobj.group(1))
1253 video_id = os.path.basename(video_url)
1255 # here's a fun little line of code for you:
1256 video_extension = os.path.splitext(video_id)[1][1:]
1257 video_id = os.path.splitext(video_id)[0]
1259 # it's tempting to parse this further, but you would
1260 # have to take into account all the variations like
1261 # Video Title - Site Name
1262 # Site Name | Video Title
1263 # Video Title - Tagline | Site Name
1264 # and so on and so forth; it's just not practical
1265 mobj = re.search(r'<title>(.*)</title>', webpage)
1267 self._downloader.trouble(u'ERROR: unable to extract title')
1269 video_title = mobj.group(1).decode('utf-8')
1271 # video uploader is domain name
1272 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1274 self._downloader.trouble(u'ERROR: unable to extract title')
1276 video_uploader = mobj.group(1).decode('utf-8')
1279 'id': video_id.decode('utf-8'),
1280 'url': video_url.decode('utf-8'),
1281 'uploader': video_uploader,
1282 'upload_date': u'NA',
1283 'title': video_title,
1284 'ext': video_extension.decode('utf-8'),
1290 class YoutubeSearchIE(InfoExtractor):
1291 """Information Extractor for YouTube search queries."""
1292 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1293 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1294 _max_youtube_results = 1000
1295 IE_NAME = u'youtube:search'
1297 def __init__(self, downloader=None):
1298 InfoExtractor.__init__(self, downloader)
1300 def report_download_page(self, query, pagenum):
1301 """Report attempt to download search page with given number."""
1302 query = query.decode(preferredencoding())
1303 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1305 def _real_extract(self, query):
1306 mobj = re.match(self._VALID_URL, query)
1308 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1311 prefix, query = query.split(':')
1313 query = query.encode('utf-8')
1315 self._download_n_results(query, 1)
1317 elif prefix == 'all':
1318 self._download_n_results(query, self._max_youtube_results)
1324 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1326 elif n > self._max_youtube_results:
1327 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1328 n = self._max_youtube_results
1329 self._download_n_results(query, n)
1331 except ValueError: # parsing prefix as integer fails
1332 self._download_n_results(query, 1)
1335 def _download_n_results(self, query, n):
1336 """Downloads a specified number of results for a query"""
1342 while (50 * pagenum) < limit:
1343 self.report_download_page(query, pagenum+1)
1344 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1345 request = urllib2.Request(result_url)
1347 data = urllib2.urlopen(request).read()
1348 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1349 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1351 api_response = json.loads(data)['data']
1353 new_ids = list(video['id'] for video in api_response['items'])
1354 video_ids += new_ids
1356 limit = min(n, api_response['totalItems'])
1359 if len(video_ids) > n:
1360 video_ids = video_ids[:n]
1361 for id in video_ids:
1362 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1366 class GoogleSearchIE(InfoExtractor):
1367 """Information Extractor for Google Video search queries."""
1368 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1369 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1370 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1371 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1372 _max_google_results = 1000
1373 IE_NAME = u'video.google:search'
1375 def __init__(self, downloader=None):
1376 InfoExtractor.__init__(self, downloader)
1378 def report_download_page(self, query, pagenum):
1379 """Report attempt to download playlist page with given number."""
1380 query = query.decode(preferredencoding())
1381 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1383 def _real_extract(self, query):
1384 mobj = re.match(self._VALID_URL, query)
1386 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1389 prefix, query = query.split(':')
1391 query = query.encode('utf-8')
1393 self._download_n_results(query, 1)
1395 elif prefix == 'all':
1396 self._download_n_results(query, self._max_google_results)
1402 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1404 elif n > self._max_google_results:
1405 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1406 n = self._max_google_results
1407 self._download_n_results(query, n)
1409 except ValueError: # parsing prefix as integer fails
1410 self._download_n_results(query, 1)
1413 def _download_n_results(self, query, n):
1414 """Downloads a specified number of results for a query"""
1420 self.report_download_page(query, pagenum)
1421 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1422 request = urllib2.Request(result_url)
1424 page = urllib2.urlopen(request).read()
1425 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1426 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1429 # Extract video identifiers
1430 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1431 video_id = mobj.group(1)
1432 if video_id not in video_ids:
1433 video_ids.append(video_id)
1434 if len(video_ids) == n:
1435 # Specified n videos reached
1436 for id in video_ids:
1437 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1440 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1441 for id in video_ids:
1442 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1445 pagenum = pagenum + 1
1448 class YahooSearchIE(InfoExtractor):
1449 """Information Extractor for Yahoo! Video search queries."""
1450 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1451 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1452 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1453 _MORE_PAGES_INDICATOR = r'\s*Next'
1454 _max_yahoo_results = 1000
1455 IE_NAME = u'video.yahoo:search'
1457 def __init__(self, downloader=None):
1458 InfoExtractor.__init__(self, downloader)
1460 def report_download_page(self, query, pagenum):
1461 """Report attempt to download playlist page with given number."""
1462 query = query.decode(preferredencoding())
1463 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1465 def _real_extract(self, query):
1466 mobj = re.match(self._VALID_URL, query)
1468 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1471 prefix, query = query.split(':')
1473 query = query.encode('utf-8')
1475 self._download_n_results(query, 1)
1477 elif prefix == 'all':
1478 self._download_n_results(query, self._max_yahoo_results)
1484 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1486 elif n > self._max_yahoo_results:
1487 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1488 n = self._max_yahoo_results
1489 self._download_n_results(query, n)
1491 except ValueError: # parsing prefix as integer fails
1492 self._download_n_results(query, 1)
1495 def _download_n_results(self, query, n):
1496 """Downloads a specified number of results for a query"""
1499 already_seen = set()
1503 self.report_download_page(query, pagenum)
1504 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1505 request = urllib2.Request(result_url)
1507 page = urllib2.urlopen(request).read()
1508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1512 # Extract video identifiers
1513 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1514 video_id = mobj.group(1)
1515 if video_id not in already_seen:
1516 video_ids.append(video_id)
1517 already_seen.add(video_id)
1518 if len(video_ids) == n:
1519 # Specified n videos reached
1520 for id in video_ids:
1521 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1524 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1525 for id in video_ids:
1526 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1529 pagenum = pagenum + 1
1532 class YoutubePlaylistIE(InfoExtractor):
1533 """Information Extractor for YouTube playlists."""
1535 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1536 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1537 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1538 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1539 IE_NAME = u'youtube:playlist'
1541 def __init__(self, downloader=None):
1542 InfoExtractor.__init__(self, downloader)
1544 def report_download_page(self, playlist_id, pagenum):
1545 """Report attempt to download playlist page with given number."""
1546 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1548 def _real_extract(self, url):
1549 # Extract playlist id
1550 mobj = re.match(self._VALID_URL, url)
1552 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1556 if mobj.group(3) is not None:
1557 self._downloader.download([mobj.group(3)])
1560 # Download playlist pages
1561 # prefix is 'p' as default for playlists but there are other types that need extra care
1562 playlist_prefix = mobj.group(1)
1563 if playlist_prefix == 'a':
1564 playlist_access = 'artist'
1566 playlist_prefix = 'p'
1567 playlist_access = 'view_play_list'
1568 playlist_id = mobj.group(2)
1573 self.report_download_page(playlist_id, pagenum)
1574 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1575 request = urllib2.Request(url)
1577 page = urllib2.urlopen(request).read()
1578 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1579 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1582 # Extract video identifiers
1584 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1585 if mobj.group(1) not in ids_in_page:
1586 ids_in_page.append(mobj.group(1))
1587 video_ids.extend(ids_in_page)
1589 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1591 pagenum = pagenum + 1
1593 playliststart = self._downloader.params.get('playliststart', 1) - 1
1594 playlistend = self._downloader.params.get('playlistend', -1)
1595 if playlistend == -1:
1596 video_ids = video_ids[playliststart:]
1598 video_ids = video_ids[playliststart:playlistend]
1600 for id in video_ids:
1601 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1605 class YoutubeChannelIE(InfoExtractor):
1606 """Information Extractor for YouTube channels."""
1608 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1609 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1610 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1611 IE_NAME = u'youtube:channel'
1613 def report_download_page(self, channel_id, pagenum):
1614 """Report attempt to download channel page with given number."""
1615 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1617 def _real_extract(self, url):
1618 # Extract channel id
1619 mobj = re.match(self._VALID_URL, url)
1621 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1624 # Download channel pages
1625 channel_id = mobj.group(1)
1630 self.report_download_page(channel_id, pagenum)
1631 url = self._TEMPLATE_URL % (channel_id, pagenum)
1632 request = urllib2.Request(url)
1634 page = urllib2.urlopen(request).read()
1635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1639 # Extract video identifiers
1641 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1642 if mobj.group(1) not in ids_in_page:
1643 ids_in_page.append(mobj.group(1))
1644 video_ids.extend(ids_in_page)
1646 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1648 pagenum = pagenum + 1
1650 for id in video_ids:
1651 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1655 class YoutubeUserIE(InfoExtractor):
1656 """Information Extractor for YouTube users."""
1658 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1659 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1660 _GDATA_PAGE_SIZE = 50
1661 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1662 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1663 IE_NAME = u'youtube:user'
1665 def __init__(self, downloader=None):
1666 InfoExtractor.__init__(self, downloader)
1668 def report_download_page(self, username, start_index):
1669 """Report attempt to download user page."""
1670 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1671 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1673 def _real_extract(self, url):
1675 mobj = re.match(self._VALID_URL, url)
1677 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1680 username = mobj.group(1)
1682 # Download video ids using YouTube Data API. Result size per
1683 # query is limited (currently to 50 videos) so we need to query
1684 # page by page until there are no video ids - it means we got
1691 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1692 self.report_download_page(username, start_index)
1694 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1697 page = urllib2.urlopen(request).read()
1698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1699 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1702 # Extract video identifiers
1705 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1706 if mobj.group(1) not in ids_in_page:
1707 ids_in_page.append(mobj.group(1))
1709 video_ids.extend(ids_in_page)
1711 # A little optimization - if current page is not
1712 # "full", ie. does not contain PAGE_SIZE video ids then
1713 # we can assume that this page is the last one - there
1714 # are no more ids on further pages - no need to query
1717 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1722 all_ids_count = len(video_ids)
1723 playliststart = self._downloader.params.get('playliststart', 1) - 1
1724 playlistend = self._downloader.params.get('playlistend', -1)
1726 if playlistend == -1:
1727 video_ids = video_ids[playliststart:]
1729 video_ids = video_ids[playliststart:playlistend]
1731 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1732 (username, all_ids_count, len(video_ids)))
1734 for video_id in video_ids:
1735 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1738 class BlipTVUserIE(InfoExtractor):
1739 """Information Extractor for blip.tv users."""
1741 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1743 IE_NAME = u'blip.tv:user'
1745 def __init__(self, downloader=None):
1746 InfoExtractor.__init__(self, downloader)
1748 def report_download_page(self, username, pagenum):
1749 """Report attempt to download user page."""
1750 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1751 (self.IE_NAME, username, pagenum))
1753 def _real_extract(self, url):
1755 mobj = re.match(self._VALID_URL, url)
1757 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1760 username = mobj.group(1)
1762 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1764 request = urllib2.Request(url)
1767 page = urllib2.urlopen(request).read().decode('utf-8')
1768 mobj = re.search(r'data-users-id="([^"]+)"', page)
1769 page_base = page_base % mobj.group(1)
1770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1771 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1775 # Download video ids using BlipTV Ajax calls. Result size per
1776 # query is limited (currently to 12 videos) so we need to query
1777 # page by page until there are no video ids - it means we got
1784 self.report_download_page(username, pagenum)
1786 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1789 page = urllib2.urlopen(request).read().decode('utf-8')
1790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1794 # Extract video identifiers
1797 for mobj in re.finditer(r'href="/([^"]+)"', page):
1798 if mobj.group(1) not in ids_in_page:
1799 ids_in_page.append(unescapeHTML(mobj.group(1)))
1801 video_ids.extend(ids_in_page)
1803 # A little optimization - if current page is not
1804 # "full", ie. does not contain PAGE_SIZE video ids then
1805 # we can assume that this page is the last one - there
1806 # are no more ids on further pages - no need to query
1809 if len(ids_in_page) < self._PAGE_SIZE:
1814 all_ids_count = len(video_ids)
1815 playliststart = self._downloader.params.get('playliststart', 1) - 1
1816 playlistend = self._downloader.params.get('playlistend', -1)
1818 if playlistend == -1:
1819 video_ids = video_ids[playliststart:]
1821 video_ids = video_ids[playliststart:playlistend]
1823 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1824 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1826 for video_id in video_ids:
1827 self._downloader.download([u'http://blip.tv/'+video_id])
1830 class DepositFilesIE(InfoExtractor):
1831 """Information extractor for depositfiles.com"""
1833 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1834 IE_NAME = u'DepositFiles'
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1839 def report_download_webpage(self, file_id):
1840 """Report webpage download."""
1841 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1843 def report_extraction(self, file_id):
1844 """Report information extraction."""
1845 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1847 def _real_extract(self, url):
1848 file_id = url.split('/')[-1]
1849 # Rebuild url in english locale
1850 url = 'http://depositfiles.com/en/files/' + file_id
1852 # Retrieve file webpage with 'Free download' button pressed
1853 free_download_indication = { 'gateway_result' : '1' }
1854 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1856 self.report_download_webpage(file_id)
1857 webpage = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1862 # Search for the real file URL
1863 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1864 if (mobj is None) or (mobj.group(1) is None):
1865 # Try to figure out reason of the error.
1866 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1867 if (mobj is not None) and (mobj.group(1) is not None):
1868 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1869 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1871 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1874 file_url = mobj.group(1)
1875 file_extension = os.path.splitext(file_url)[1][1:]
1877 # Search for file title
1878 mobj = re.search(r'<b title="(.*?)">', webpage)
1880 self._downloader.trouble(u'ERROR: unable to extract title')
1882 file_title = mobj.group(1).decode('utf-8')
1885 'id': file_id.decode('utf-8'),
1886 'url': file_url.decode('utf-8'),
1888 'upload_date': u'NA',
1889 'title': file_title,
1890 'ext': file_extension.decode('utf-8'),
1896 class FacebookIE(InfoExtractor):
1897 """Information Extractor for Facebook"""
1899 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1900 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1901 _NETRC_MACHINE = 'facebook'
1902 _available_formats = ['video', 'highqual', 'lowqual']
1903 _video_extensions = {
1908 IE_NAME = u'facebook'
1910 def __init__(self, downloader=None):
1911 InfoExtractor.__init__(self, downloader)
1913 def _reporter(self, message):
1914 """Add header and report message."""
1915 self._downloader.to_screen(u'[facebook] %s' % message)
1917 def report_login(self):
1918 """Report attempt to log in."""
1919 self._reporter(u'Logging in')
1921 def report_video_webpage_download(self, video_id):
1922 """Report attempt to download video webpage."""
1923 self._reporter(u'%s: Downloading video webpage' % video_id)
1925 def report_information_extraction(self, video_id):
1926 """Report attempt to extract video information."""
1927 self._reporter(u'%s: Extracting video information' % video_id)
# Page parser, login bootstrap and main extraction entry point.  Numbered
# partial listing: elisions are flagged below; do not re-indent or rewrite
# without the missing lines.
1929 def _parse_page(self, video_webpage):
1930 """Extract video information from page"""
# Map of info keys to the regex that captures each value from the page.
1932 data = {'title': r'\("video_title", "(.*?)"\)',
1933 'description': r'<div class="datawrap">(.*?)</div>',
1934 'owner': r'\("video_owner_name", "(.*?)"\)',
1935 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# NOTE(review): the dict close and the video_info initializer (lines
# 1936-1937) are elided from this listing.
1938 for piece in data.keys():
1939 mobj = re.search(data[piece], video_webpage)
1940 if mobj is not None:
1941 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# One URL per known format; the video_urls initializer (~1943-1944) is elided.
1945 for fmt in self._available_formats:
1946 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1947 if mobj is not None:
1948 # URL is in a Javascript segment inside an escaped Unicode format within
1949 # the generally utf-8 page
1950 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1951 video_info['video_urls'] = video_urls
# Optional login using --username/--password or the 'facebook' .netrc entry;
# failures are reported as warnings only and extraction proceeds anonymously.
1955 def _real_initialize(self):
1956 if self._downloader is None:
# (early-return body elided)
1961 downloader_params = self._downloader.params
1963 # Attempt to use provided username and password or .netrc data
1964 if downloader_params.get('username', None) is not None:
1965 useremail = downloader_params['username']
1966 password = downloader_params['password']
1967 elif downloader_params.get('usenetrc', False):
1969 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1970 if info is not None:
# (success branch, lines 1971-1973, elided)
1974 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1975 except (IOError, netrc.NetrcParseError), err:
1976 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1979 if useremail is None:
# (login_form construction, lines 1980-1987, elided)
1988 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# A login form in the response page means the login did not stick.
1991 login_results = urllib2.urlopen(request).read()
1992 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1993 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1996 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1999 def _real_extract(self, url):
2000 mobj = re.match(self._VALID_URL, url)
# (the "if mobj is None" guard line is elided)
2002 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2004 video_id = mobj.group('ID')
2007 self.report_video_webpage_download(video_id)
2008 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2010 page = urllib2.urlopen(request)
2011 video_webpage = page.read()
2012 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2013 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2016 # Start extracting information
2017 self.report_information_extraction(video_id)
2019 # Extract information
2020 video_info = self._parse_page(video_webpage)
2023 if 'owner' not in video_info:
2024 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2026 video_uploader = video_info['owner']
2029 if 'title' not in video_info:
2030 self._downloader.trouble(u'ERROR: unable to extract video title')
2032 video_title = video_info['title']
2033 video_title = video_title.decode('utf-8')
# Missing thumbnail is only a warning; an empty string is used instead.
2036 if 'thumbnail' not in video_info:
2037 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2038 video_thumbnail = ''
2040 video_thumbnail = video_info['thumbnail']
2044 if 'upload_date' in video_info:
2045 upload_time = video_info['upload_date']
2046 timetuple = email.utils.parsedate_tz(upload_time)
2047 if timetuple is not None:
2049 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2054 video_description = video_info.get('description', 'No description available.')
2056 url_map = video_info['video_urls']
2057 if len(url_map.keys()) > 0:
2058 # Decide which formats to download
2059 req_format = self._downloader.params.get('format', None)
2060 format_limit = self._downloader.params.get('format_limit', None)
2062 if format_limit is not None and format_limit in self._available_formats:
2063 format_list = self._available_formats[self._available_formats.index(format_limit):]
2065 format_list = self._available_formats
2066 existing_formats = [x for x in format_list if x in url_map]
2067 if len(existing_formats) == 0:
2068 self._downloader.trouble(u'ERROR: no known formats available for video')
2070 if req_format is None:
2071 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2072 elif req_format == 'worst':
2073 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2074 elif req_format == '-1':
2075 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2078 if req_format not in url_map:
2079 self._downloader.trouble(u'ERROR: requested format not available')
2081 video_url_list = [(req_format, url_map[req_format])] # Specific format
2084 for format_param, video_real_url in video_url_list:
2086 video_extension = self._video_extensions.get(format_param, 'mp4')
# Result dict fields; the surrounding 'info = {' / append lines are elided.
2089 'id': video_id.decode('utf-8'),
2090 'url': video_real_url.decode('utf-8'),
2091 'uploader': video_uploader.decode('utf-8'),
2092 'upload_date': upload_date,
2093 'title': video_title,
2094 'ext': video_extension.decode('utf-8'),
2095 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2096 'thumbnail': video_thumbnail.decode('utf-8'),
2097 'description': video_description.decode('utf-8'),
# Numbered partial listing; see note at FacebookIE above.
2102 class BlipTVIE(InfoExtractor):
2103 """Information extractor for blip.tv"""
2105 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used by _real_extract to pull the filename extension out of a media URL.
2106 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2107 IE_NAME = u'blip.tv'
def report_extraction(self, file_id):
    """Log that metadata extraction for *file_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(line)

def report_direct_download(self, title):
    """Log that *title* turned out to be a directly downloadable file."""
    line = u'[%s] %s: Direct download detected' % (self.IE_NAME, title)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  Flow: request the page with a JSON
# skin; if the response is already a video (direct download) synthesize the
# info from the filename, otherwise parse the JSON payload.
2117 def _real_extract(self, url):
2118 mobj = re.match(self._VALID_URL, url)
2120 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# NOTE(review): 'cchar' (query separator '?' vs '&') is chosen on elided
# lines 2121-2126.
2127 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2128 request = urllib2.Request(json_url.encode('utf-8'))
2129 self.report_extraction(mobj.group(1))
2132 urlh = urllib2.urlopen(request)
2133 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2134 basename = url.split('/')[-1]
2135 title,ext = os.path.splitext(basename)
2136 title = title.decode('UTF-8')
2137 ext = ext.replace('.', '')
2138 self.report_direct_download(title)
# (direct-download info dict, lines 2139-2145, elided)
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2149 if info is None: # Regular URL
2151 json_code = urlh.read()
2152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2157 json_data = json.loads(json_code)
2158 if 'Post' in json_data:
2159 data = json_data['Post']
2163 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2164 video_url = data['media']['url']
2165 umobj = re.match(self._URL_EXT, video_url)
2167 raise ValueError('Can not determine filename extension')
2168 ext = umobj.group(1)
2171 'id': data['item_id'],
2173 'uploader': data['display_name'],
2174 'upload_date': upload_date,
2175 'title': data['title'],
2177 'format': data['media']['mimeType'],
2178 'thumbnail': data['thumbnailUrl'],
2179 'description': data['description'],
2180 'player_url': data['embedUrl']
2182 except (ValueError,KeyError), err:
2183 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): mutates the module-global std_headers -- affects every later
# request made by any extractor, not just blip.tv.
2186 std_headers['User-Agent'] = 'iTunes/10.6.1'
# Numbered partial listing; see note at FacebookIE above.
2190 class MyVideoIE(InfoExtractor):
2191 """Information Extractor for myvideo.de."""
2193 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2194 IE_NAME = u'myvideo'
def __init__(self, downloader=None):
    """Instantiate the extractor; *downloader* may also be attached later."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Log that the watch page for *video_id* is being fetched."""
    line = u'[myvideo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[myvideo] %s: Extracting information' % video_id
    self._downloader.to_screen(line)
2207 def _real_extract(self,url):
2208 mobj = re.match(self._VALID_URL, url)
2210 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2213 video_id = mobj.group(1)
2216 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2218 self.report_download_webpage(video_id)
2219 webpage = urllib2.urlopen(request).read()
2220 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2221 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2224 self.report_extraction(video_id)
2225 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2228 self._downloader.trouble(u'ERROR: unable to extract media URL')
2230 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2232 mobj = re.search('<title>([^<]+)</title>', webpage)
2234 self._downloader.trouble(u'ERROR: unable to extract title')
2237 video_title = mobj.group(1)
2243 'upload_date': u'NA',
2244 'title': video_title,
# Numbered partial listing; see note at FacebookIE above.
2250 class ComedyCentralIE(InfoExtractor):
2251 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a shortname (":tds", ":colbertnation", ...) or a full
# full-episodes URL on either show's site.
2253 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2254 IE_NAME = u'comedycentral'
# Bitrates ordered worst-first; _real_extract picks turls[-1] as "highest".
2256 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Bodies of both dicts (lines 2259-2265 and 2267-2274) are elided.
2258 _video_extensions = {
2266 _video_dimensions = {
def report_extraction(self, episode_id):
    """Log that metadata extraction for *episode_id* has started."""
    line = u'[comedycentral] %s: Extracting information' % episode_id
    self._downloader.to_screen(line)

def report_config_download(self, episode_id):
    """Log that the media configuration for *episode_id* is being fetched."""
    line = u'[comedycentral] %s: Downloading configuration' % episode_id
    self._downloader.to_screen(line)

def report_index_download(self, episode_id):
    """Log that the show index for *episode_id* is being fetched."""
    line = u'[comedycentral] %s: Downloading show index' % episode_id
    self._downloader.to_screen(line)

def report_player_url(self, episode_id):
    """Log that the player URL for *episode_id* is being resolved."""
    line = u'[comedycentral] %s: Determining player URL' % episode_id
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  NOTE(review): this class formats
# errors with unicode(err) while sibling extractors use compat_str(err) --
# inconsistent, but cannot be changed safely from this fragment.
2288 def _print_formats(self, formats):
2289 print('Available formats:')
# NOTE(review): the loop header binding 'x' (line 2290) is elided here.
2291 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Flow: resolve shortname to a show URL, follow the redirect to a concrete
# episode, locate the Flash/MRSS feed, then download per-item configuration.
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2300 if mobj.group('shortname'):
2301 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2302 url = u'http://www.thedailyshow.com/full-episodes/'
2304 url = u'http://www.colbertnation.com/full-episodes/'
2305 mobj = re.match(self._VALID_URL, url)
2306 assert mobj is not None
2308 dlNewest = not mobj.group('episode')
2310 epTitle = mobj.group('showname')
2312 epTitle = mobj.group('episode')
2314 req = urllib2.Request(url)
2315 self.report_extraction(epTitle)
2317 htmlHandle = urllib2.urlopen(req)
2318 html = htmlHandle.read()
2319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The site redirects the bare full-episodes URL to the newest episode.
2323 url = htmlHandle.geturl()
2324 mobj = re.match(self._VALID_URL, url)
2326 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2328 if mobj.group('episode') == '':
2329 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2331 epTitle = mobj.group('episode')
2333 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2334 if len(mMovieParams) == 0:
2335 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2338 playerUrl_raw = mMovieParams[0][0]
2339 self.report_player_url(epTitle)
2341 urlHandle = urllib2.urlopen(playerUrl_raw)
2342 playerUrl = urlHandle.geturl()
2343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2347 uri = mMovieParams[0][1]
2348 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2349 self.report_index_download(epTitle)
2351 indexXml = urllib2.urlopen(indexUrl).read()
2352 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2353 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per act/segment of the episode.
2358 idoc = xml.etree.ElementTree.fromstring(indexXml)
2359 itemEls = idoc.findall('.//item')
2360 for itemEl in itemEls:
2361 mediaId = itemEl.findall('./guid')[0].text
2362 shortMediaId = mediaId.split(':')[-1]
2363 showId = mediaId.split(':')[-2].replace('.com', '')
2364 officialTitle = itemEl.findall('./title')[0].text
2365 officialDate = itemEl.findall('./pubDate')[0].text
2367 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2368 urllib.urlencode({'uri': mediaId}))
2369 configReq = urllib2.Request(configUrl)
2370 self.report_config_download(epTitle)
2372 configXml = urllib2.urlopen(configReq).read()
2373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2374 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2377 cdoc = xml.etree.ElementTree.fromstring(configXml)
2379 for rendition in cdoc.findall('.//rendition'):
2380 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2384 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2387 if self._downloader.params.get('listformats', None):
2388 self._print_formats([i[0] for i in turls])
2391 # For now, just pick the highest bitrate
2392 format,video_url = turls[-1]
2394 # Get the format arg from the arg stream
2395 req_format = self._downloader.params.get('format', None)
2397 # Select format if we can find one
# (loop over turls binding f, v is on elided lines 2398-2399)
2400 format, video_url = f, v
2403 # Patch to download from alternative CDN, which does not
2404 # break on current RTMPDump builds
2405 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2406 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2408 if video_url.startswith(broken_cdn):
2409 video_url = video_url.replace(broken_cdn, better_cdn)
2411 effTitle = showId + u'-' + epTitle
# (info dict construction is partially elided)
2416 'upload_date': officialDate,
2421 'description': officialTitle,
2422 'player_url': None #playerUrl
2425 results.append(info)
# Numbered partial listing; see note at FacebookIE above.
2430 class EscapistIE(InfoExtractor):
2431 """Information extractor for The Escapist """
2433 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2434 IE_NAME = u'escapist'
def report_extraction(self, showName):
    """Log that metadata extraction for *showName* has started."""
    line = u'[escapist] %s: Extracting information' % showName
    self._downloader.to_screen(line)

def report_config_download(self, showName):
    """Log that the player configuration for *showName* is being fetched."""
    line = u'[escapist] %s: Downloading configuration' % showName
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  Flow: fetch the page, pull
# description/thumbnail/player from OpenGraph meta tags, then fetch and parse
# the player's JS "config" blob as JSON.
2442 def _real_extract(self, url):
2443 mobj = re.match(self._VALID_URL, url)
2445 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2447 showName = mobj.group('showname')
2448 videoId = mobj.group('episode')
2450 self.report_extraction(showName)
2452 webPage = urllib2.urlopen(url)
2453 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, defaulting to utf-8.
2454 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2455 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): the four .group(1) calls below are applied without None
# checks -- a page missing any of these meta tags raises AttributeError.
2460 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2461 description = unescapeHTML(descMatch.group(1))
2462 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2463 imgUrl = unescapeHTML(imgMatch.group(1))
2464 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2465 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2466 configUrlMatch = re.search('config=(.*)$', playerUrl)
2467 configUrl = urllib2.unquote(configUrlMatch.group(1))
2469 self.report_config_download(showName)
2471 configJSON = urllib2.urlopen(configUrl).read()
2472 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2473 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2476 # Technically, it's JavaScript, not JSON
# NOTE(review): global quote substitution -- corrupts any value that itself
# contains a quote character.
2477 configJSON = configJSON.replace("'", '"')
2480 config = json.loads(configJSON)
2481 except (ValueError,), err:
2482 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2485 playlist = config['playlist']
2486 videoUrl = playlist[1]['url']
# (info dict construction is partially elided)
2491 'uploader': showName,
2492 'upload_date': None,
2496 'thumbnail': imgUrl,
2497 'description': description,
2498 'player_url': playerUrl,
# Numbered partial listing; see note at FacebookIE above.
2504 class CollegeHumorIE(InfoExtractor):
2505 """Information extractor for collegehumor.com"""
2507 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2508 IE_NAME = u'collegehumor'
def report_webpage(self, video_id):
    """Log that the page for *video_id* is being fetched."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  Flow: scrape the internal video id
# from the page, then read title/url/thumbnail from the moogaloop XML feed.
2518 def _real_extract(self, url):
2519 mobj = re.match(self._VALID_URL, url)
2521 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2523 video_id = mobj.group('videoid')
2525 self.report_webpage(video_id)
2526 request = urllib2.Request(url)
2528 webpage = urllib2.urlopen(request).read()
2529 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2530 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2533 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2535 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2537 internal_video_id = m.group('internalvideoid')
# (start of the 'info' dict, around line 2539, is elided)
2541 'internal_id': internal_video_id,
2544 self.report_extraction(video_id)
2545 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2547 metaXml = urllib2.urlopen(xmlUrl).read()
2548 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2549 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2552 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2554 videoNode = mdoc.findall('./video')[0]
2555 info['description'] = videoNode.findall('./description')[0].text
2556 info['title'] = videoNode.findall('./caption')[0].text
2557 info['url'] = videoNode.findall('./file')[0].text
2558 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the media URL; 'format' mirrors it.
2559 info['ext'] = info['url'].rpartition('.')[2]
2560 info['format'] = info['ext']
2562 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Numbered partial listing; see note at FacebookIE above.
2568 class XVideosIE(InfoExtractor):
2569 """Information extractor for xvideos.com"""
2571 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2572 IE_NAME = u'xvideos'
def report_webpage(self, video_id):
    """Log that the page for *video_id* is being fetched."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions (the "if mobj is None" guards
# between each search and its error report are on elided lines).
2582 def _real_extract(self, url):
2583 mobj = re.match(self._VALID_URL, url)
2585 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2587 video_id = mobj.group(1).decode('utf-8')
2589 self.report_webpage(video_id)
2591 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2593 webpage = urllib2.urlopen(request).read()
2594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2595 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2598 self.report_extraction(video_id)
# Media URL is url-encoded inside a flv_url= query parameter.
2602 mobj = re.search(r'flv_url=(.+?)&', webpage)
2604 self._downloader.trouble(u'ERROR: unable to extract video url')
2606 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2610 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2612 self._downloader.trouble(u'ERROR: unable to extract video title')
2614 video_title = mobj.group(1).decode('utf-8')
2617 # Extract video thumbnail
2618 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2620 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail.
2622 video_thumbnail = mobj.group(0).decode('utf-8')
# (info dict construction is partially elided)
2628 'upload_date': None,
2629 'title': video_title,
2632 'thumbnail': video_thumbnail,
2633 'description': None,
# Numbered partial listing; see note at FacebookIE above.  The closing quotes
# of the class docstring (~lines 2647-2648) are elided.
2640 class SoundcloudIE(InfoExtractor):
2641 """Information extractor for soundcloud.com
2642 To access the media, the uid of the song and a stream token
2643 must be extracted from the page source and the script must make
2644 a request to media.soundcloud.com/crossdomain.xml. Then
2645 the media can be grabbed by requesting from an url composed
2646 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
2649 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2650 IE_NAME = u'soundcloud'
def __init__(self, downloader=None):
    """Instantiate the extractor; *downloader* may also be attached later."""
    InfoExtractor.__init__(self, downloader)

def report_webpage(self, video_id):
    """Log that the page for *video_id* is being fetched."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  Flow: derive uploader/slug from the
# URL, scrape uid + stream_token from the page, build the media URL, then
# request crossdomain.xml so the media host sets its cookies.
2663 def _real_extract(self, url):
2664 mobj = re.match(self._VALID_URL, url)
2666 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2669 # extract uploader (which is in the url)
2670 uploader = mobj.group(1).decode('utf-8')
2671 # extract simple title (uploader + slug of song title)
2672 slug_title = mobj.group(2).decode('utf-8')
2673 simple_title = uploader + u'-' + slug_title
2675 self.report_webpage('%s/%s' % (uploader, slug_title))
2677 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2679 webpage = urllib2.urlopen(request).read()
2680 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2681 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2684 self.report_extraction('%s/%s' % (uploader, slug_title))
2686 # extract uid and stream token that soundcloud hands out for access
2687 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2689 video_id = mobj.group(1)
2690 stream_token = mobj.group(2)
2692 # extract unsimplified title
2693 mobj = re.search('"title":"(.*?)",', webpage)
2695 title = mobj.group(1).decode('utf-8')
# Fall back to the slug-derived title when none is found in the page.
2697 title = simple_title
2699 # construct media url (with uid/token)
2700 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2701 mediaURL = mediaURL % (video_id, stream_token)
2704 description = u'No description available'
2705 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2707 description = mobj.group(1)
# Upload date parse failures are only logged, not fatal.
2711 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2714 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2715 except Exception, e:
2716 self._downloader.to_stderr(compat_str(e))
2718 # for soundcloud, a request to a cross domain is required for cookies
2719 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# (info dict construction is partially elided)
2722 'id': video_id.decode('utf-8'),
2724 'uploader': uploader.decode('utf-8'),
2725 'upload_date': upload_date,
2730 'description': description.decode('utf-8')
# Numbered partial listing; see note at FacebookIE above.  NOTE(review): the
# IE_NAME assignment is not visible here (presumably on an elided line) even
# though the report methods below reference self.IE_NAME -- verify upstream.
2734 class InfoQIE(InfoExtractor):
2735 """Information extractor for infoq.com"""
2737 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
def report_webpage(self, video_id):
    """Log that the page for *video_id* is being fetched."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions (None-check guards between searches
# and their error reports are on elided lines).
2748 def _real_extract(self, url):
2749 mobj = re.match(self._VALID_URL, url)
2751 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2754 self.report_webpage(url)
2756 request = urllib2.Request(url)
2758 webpage = urllib2.urlopen(request).read()
2759 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2760 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2763 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
2767 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2769 self._downloader.trouble(u'ERROR: unable to extract video url')
2771 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2775 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2777 self._downloader.trouble(u'ERROR: unable to extract video title')
2779 video_title = mobj.group(1).decode('utf-8')
2781 # Extract description
2782 video_description = u'No description available.'
2783 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2784 if mobj is not None:
2785 video_description = mobj.group(1).decode('utf-8')
# The id and extension come from the last path component of the media URL.
2787 video_filename = video_url.split('/')[-1]
2788 video_id, extension = video_filename.split('.')
# (info dict construction is partially elided)
2794 'upload_date': None,
2795 'title': video_title,
2797 'format': extension, # Extension is always(?) mp4, but seems to be flv
2799 'description': video_description,
# Numbered partial listing; see note at FacebookIE above.
2805 class MixcloudIE(InfoExtractor):
2806 """Information extractor for www.mixcloud.com"""
# Groups: (1) uploader slug, (2) cloudcast slug.
2807 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2808 IE_NAME = u'mixcloud'
def __init__(self, downloader=None):
    """Instantiate the extractor; *downloader* may also be attached later."""
    InfoExtractor.__init__(self, downloader)

def report_download_json(self, file_id):
    """Log that the cloudcast JSON is being fetched (*file_id* is unused)."""
    line = u'[%s] Downloading json' % self.IE_NAME
    self._downloader.to_screen(line)

def report_extraction(self, file_id):
    """Log that metadata extraction for *file_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(line)
# Helpers plus the main entry point.  Numbered partial listing: several
# try/return lines are elided (flagged below); do not rewrite without them.
2821 def get_urls(self, jsonData, fmt, bitrate='best'):
2822 """Get urls from 'audio_formats' section in json"""
# NOTE(review): the enclosing try (~2823-2824) and trailing return
# (~2832) are elided.
2825 bitrate_list = jsonData[fmt]
2826 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2827 bitrate = max(bitrate_list) # select highest
2829 url_list = jsonData[fmt][bitrate]
2830 except TypeError: # we have no bitrate info.
2831 url_list = jsonData[fmt]
2834 def check_urls(self, url_list):
2835 """Returns 1st active url from list"""
# Probes each candidate URL; the try/"return url" lines are elided here.
2836 for url in url_list:
2838 urllib2.urlopen(url)
2840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2845 def _print_formats(self, formats):
2846 print('Available formats:')
2847 for fmt in formats.keys():
2848 for b in formats[fmt]:
# (the try opening line 2849 is elided)
2850 ext = formats[fmt][b][0]
2851 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2852 except TypeError: # we have no bitrate info
2853 ext = formats[fmt][0]
2854 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2857 def _real_extract(self, url):
2858 mobj = re.match(self._VALID_URL, url)
2860 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2862 # extract uploader & filename from url
2863 uploader = mobj.group(1).decode('utf-8')
2864 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2866 # construct API request
2867 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2868 # retrieve .json file with links to files
2869 request = urllib2.Request(file_url)
2871 self.report_download_json(file_url)
2872 jsonData = urllib2.urlopen(request).read()
2873 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2874 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2878 json_data = json.loads(jsonData)
2879 player_url = json_data['player_swf_url']
2880 formats = dict(json_data['audio_formats'])
2882 req_format = self._downloader.params.get('format', None)
2885 if self._downloader.params.get('listformats', None):
2886 self._print_formats(formats)
# 'best'/default: first format whose URLs respond; otherwise the exact
# requested format (error if absent).
2889 if req_format is None or req_format == 'best':
2890 for format_param in formats.keys():
2891 url_list = self.get_urls(formats, format_param)
2893 file_url = self.check_urls(url_list)
2894 if file_url is not None:
2897 if req_format not in formats.keys():
2898 self._downloader.trouble(u'ERROR: format is not available')
2901 url_list = self.get_urls(formats, req_format)
2902 file_url = self.check_urls(url_list)
2903 format_param = req_format
# (info dict construction is partially elided)
2906 'id': file_id.decode('utf-8'),
2907 'url': file_url.decode('utf-8'),
2908 'uploader': uploader.decode('utf-8'),
2909 'upload_date': u'NA',
2910 'title': json_data['name'],
2911 'ext': file_url.split('.')[-1].decode('utf-8'),
2912 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2913 'thumbnail': json_data['thumbnail_url'],
2914 'description': json_data['description'],
2915 'player_url': player_url.decode('utf-8'),
# Numbered partial listing; see note at FacebookIE above.
2918 class StanfordOpenClassroomIE(InfoExtractor):
2919 """Information extractor for Stanford's Open ClassRoom"""
# Matches the root page, a CoursePage, or a specific VideoPage.
2921 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2922 IE_NAME = u'stanfordoc'
def report_download_webpage(self, objid):
    """Log that the page for *objid* is being fetched."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
# Numbered partial listing with elisions.  Three cases: a specific video
# (course+video), a course page (recurse into its VideoPages), or the root
# page (recurse into its CoursePages) -- recursion goes through self.extract.
2932 def _real_extract(self, url):
2933 mobj = re.match(self._VALID_URL, url)
2935 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2938 if mobj.group('course') and mobj.group('video'): # A specific video
2939 course = mobj.group('course')
2940 video = mobj.group('video')
# (start of the 'info' dict is elided)
2942 'id': course + '_' + video,
2945 self.report_extraction(info['id'])
2946 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2947 xmlUrl = baseUrl + video + '.xml'
2949 metaXml = urllib2.urlopen(xmlUrl).read()
2950 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2951 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2953 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2955 info['title'] = mdoc.findall('./title')[0].text
2956 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2958 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2960 info['ext'] = info['url'].rpartition('.')[2]
2961 info['format'] = info['ext']
2963 elif mobj.group('course'): # A course page
2964 course = mobj.group('course')
2970 self.report_download_webpage(info['id'])
2972 coursepage = urllib2.urlopen(url).read()
2973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2974 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2977 m = re.search('<h1>([^<]+)</h1>', coursepage)
2979 info['title'] = unescapeHTML(m.group(1))
2981 info['title'] = info['id']
2983 m = re.search('<description>([^<]+)</description>', coursepage)
2985 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry that is extracted below.
2987 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2990 'type': 'reference',
2991 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2995 for entry in info['list']:
2996 assert entry['type'] == 'reference'
2997 results += self.extract(entry['url'])
# Root case: enumerate every CoursePage and recurse the same way.
3002 'id': 'Stanford OpenClassroom',
3006 self.report_download_webpage(info['id'])
3007 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3009 rootpage = urllib2.urlopen(rootURL).read()
3010 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3011 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3014 info['title'] = info['id']
3016 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3019 'type': 'reference',
3020 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3025 for entry in info['list']:
3026 assert entry['type'] == 'reference'
3027 results += self.extract(entry['url'])
3030 class MTVIE(InfoExtractor):
3031 """Information extractor for MTV.com"""
# NOTE(review): this excerpt has gaps — several original lines (IE_NAME,
# `if mobj is None:` guards, `try:` headers, `return` statements) are
# missing from view. Comments below describe only what the visible code does.
# URL must be an mtv.com /videos/ page; scheme is optional (see 'proto' group),
# numeric video id is captured in the 'videoid' group.
3033 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Progress logging: announce the webpage download for this video id.
3036 def report_webpage(self, video_id):
3037 """Report information extraction."""
3038 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
# Progress logging: announce the metadata-extraction step.
3040 def report_extraction(self, video_id):
3041 """Report information extraction."""
3042 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Main entry point: scrape the MTV video page for metadata, then fetch the
# mediaGen XML to obtain the actual stream URL.
3044 def _real_extract(self, url):
3045 mobj = re.match(self._VALID_URL, url)
# (guard line missing in excerpt — trouble() is presumably under `if mobj is None:`)
3047 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs so urllib2 can open them.
3049 if not mobj.group('proto'):
3050 url = 'http://' + url
3051 video_id = mobj.group('videoid')
3052 self.report_webpage(video_id)
3054 request = urllib2.Request(url)
# (the `try:` header for this download is missing from the excerpt)
3056 webpage = urllib2.urlopen(request).read()
3057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3058 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song name comes from the <meta name="mtv_vt"> tag; the page is latin-1
# encoded, hence the explicit decode before HTML-unescaping (Python 2 str).
3061 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3063 self._downloader.trouble(u'ERROR: unable to extract song name')
3065 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
# Performer/artist from the <meta name="mtv_an"> tag.
3066 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3068 self._downloader.trouble(u'ERROR: unable to extract performer')
3070 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071 video_title = performer + ' - ' + song_name
# mtvn_uri identifies the clip for the mediaGen playlist service.
3073 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): error message is garbled — should read "unable to extract
# mtvn_uri". Runtime string, so not changed in this documentation pass.
3075 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3077 mtvn_uri = mobj.group(1)
# Playlist/content id is embedded in the page's player JavaScript.
3079 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3081 self._downloader.trouble(u'ERROR: unable to extract content id')
3083 content_id = mobj.group(1)
# Build the mediaGen XML request from the scraped identifiers.
3085 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086 self.report_extraction(video_id)
3087 request = urllib2.Request(videogen_url)
# (the `try:` header for this download is missing from the excerpt)
3089 metadataXml = urllib2.urlopen(request).read()
3090 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3091 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
# Parse the mediaGen XML; each <rendition> element is one quality level.
3094 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095 renditions = mdoc.findall('.//rendition')
3097 # For now, always pick the highest quality.
# Assumes renditions are listed lowest-to-highest quality in document
# order — TODO confirm against the mediaGen response format.
3098 rendition = renditions[-1]
# Derive extension from the MIME type (e.g. "video/mp4" -> "mp4") and build
# a human-readable format label "ext-WxH_bitrate".
3101 _,_,ext = rendition.attrib['type'].partition('/')
3102 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103 video_url = rendition.find('./src').text
# (reached when the rendition lacks the expected attributes/children;
# the KeyError/AttributeError handler header is missing from the excerpt)
3105 self._downloader.trouble('Invalid rendition field.')
# Result dict (truncated in this excerpt — 'id', 'url', 'ext' etc. and the
# surrounding return are on lines not shown here).
3111 'uploader': performer,
3112 'title': video_title,
3120 class YoukuIE(InfoExtractor):
# NOTE(review): excerpt has gaps — the IE docstring, `def _gen_sid(self):`
# header, `mixed = []` initialization, format-selection branches and the
# final `return` lines are on original lines not visible here.
# Video pages look like http://v.youku.com/v_show/id_<alnum id>.html.
3122 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3125 def __init__(self, downloader=None):
3126 InfoExtractor.__init__(self, downloader)
# Progress logging: webpage (JSON playlist) download.
3128 def report_download_webpage(self, file_id):
3129 """Report webpage download."""
3130 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
# Progress logging: metadata extraction step.
3132 def report_extraction(self, file_id):
3133 """Report information extraction."""
3134 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random integers.
# (the `def _gen_sid(self):` line itself is missing from this excerpt)
3137 nowTime = int(time.time() * 1000)
3138 random1 = random.randint(1000,1998)
3139 random2 = random.randint(1000,9999)
3141 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffle a fixed character set using the server-supplied
# seed, via the LCG  seed = (seed*211 + 30031) mod 65536.  The resulting
# string is the lookup table used by _get_file_id below.
3143 def _get_file_ID_mix_string(self, seed):
3145 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3147 for i in range(len(source)):
3148 seed = (seed * 211 + 30031 ) % 65536
# Pick an index proportional to the current seed, emit that character and
# remove it so every source character is used exactly once.
3149 index = math.floor(seed / 65536 * len(source) )
3150 mixed.append(source[int(index)])
3151 source.remove(source[int(index)])
3152 #return ''.join(mixed)
# Decode the obfuscated file id: each '*'-separated token is an integer
# index into the seed-shuffled table produced above.
3155 def _get_file_id(self, fileId, seed):
3156 mixed = self._get_file_ID_mix_string(seed)
3157 ids = fileId.split('*')
3161 realId.append(mixed[int(ch)])
3162 return ''.join(realId)
# Main entry point: fetch the getPlayList JSON, choose a format, decode the
# file id and emit one download URL per video segment.
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
# (guard line missing in excerpt — trouble() presumably under `if mobj is None:`)
3167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3169 video_id = mobj.group('ID')
3171 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3173 request = urllib2.Request(info_url, None, std_headers)
3175 self.report_download_webpage(video_id)
3176 jsondata = urllib2.urlopen(request).read()
3177 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3178 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3181 self.report_extraction(video_id)
# Playlist metadata: title, the per-video obfuscation seed, and the map of
# available stream formats (keys of 'streamfileids').
3183 config = json.loads(jsondata)
3185 video_title = config['data'][0]['title']
3186 seed = config['data'][0]['seed']
3188 format = self._downloader.params.get('format', None)
3189 supported_format = config['data'][0]['streamfileids'].keys()
# Format selection (branches partially missing from excerpt): 'best'
# prefers hd2 when available; 'worst' takes the low end.
3191 if format is None or format == 'best':
3192 if 'hd2' in supported_format:
3197 elif format == 'worst':
3205 fileid = config['data'][0]['streamfileids'][format]
3206 seg_number = len(config['data'][0]['segs'][format])
# Collect the per-segment access keys ('k') needed for each download URL.
3209 for i in xrange(seg_number):
3210 keys.append(config['data'][0]['segs'][format][i]['k'])
3213 #youku only could be viewed from mainland china
3215 self._downloader.trouble(u'ERROR: unable to extract info section')
3219 sid = self._gen_sid()
3220 fileid = self._get_file_id(fileid, seed)
3222 #column 8,9 of fileid represent the segment number
3223 #fileid[7:9] should be changed
# One entry per segment: splice the zero-based segment index (two hex
# digits) into positions 8-9 of the decoded file id.
3224 for index, key in enumerate(keys):
3226 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3227 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3230 'id': '%s_part%02d' % (video_id, index),
3231 'url': download_url,
3233 'title': video_title,
3237 files_info.append(info)
3242 class XNXXIE(InfoExtractor):
3243 """Information extractor for xnxx.com"""
# NOTE(review): excerpt has gaps — IE_NAME, `if result is None:` guards,
# `try:` headers and `return` lines are on original lines not shown here.
3245 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Scrape patterns for the page source: percent-encoded flv URL from the
# player query string, page <title> (site suffix stripped), and thumbnail.
3247 VIDEO_URL_RE = r'flv_url=(.*?)&'
3248 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3249 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
# Progress logging: webpage download.
3251 def report_webpage(self, video_id):
3252 """Report information extraction"""
3253 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
# Progress logging: extraction step.
3255 def report_extraction(self, video_id):
3256 """Report information extraction"""
3257 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Main entry point: download the video page and pull URL/title/thumbnail
# out of it with the regexes above.
3259 def _real_extract(self, url):
3260 mobj = re.match(self._VALID_URL, url)
# (guard line missing in excerpt — trouble() presumably under `if mobj is None:`)
3262 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Numeric id from the first capture group (Python 2 str -> unicode).
3264 video_id = mobj.group(1).decode('utf-8')
3266 self.report_webpage(video_id)
3268 # Get webpage content
# (the `try:` header for this download is missing from the excerpt)
3270 webpage = urllib2.urlopen(url).read()
3271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the player parameters; unquote it.
3275 result = re.search(self.VIDEO_URL_RE, webpage)
3277 self._downloader.trouble(u'ERROR: unable to extract video url')
3279 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3281 result = re.search(self.VIDEO_TITLE_RE, webpage)
3283 self._downloader.trouble(u'ERROR: unable to extract video title')
3285 video_title = result.group(1).decode('utf-8')
3287 result = re.search(self.VIDEO_THUMB_RE, webpage)
3289 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3291 video_thumbnail = result.group(1).decode('utf-8')
# Result dict (truncated in this excerpt — 'url', 'ext' etc. and the
# surrounding return are on lines not shown here).
3293 info = {'id': video_id,
3296 'upload_date': None,
3297 'title': video_title,
3300 'thumbnail': video_thumbnail,
3301 'description': None,
3307 class GooglePlusIE(InfoExtractor):
3308 """Information extractor for plus.google.com."""
3310 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3311 IE_NAME = u'plus.google'
3313 def __init__(self, downloader=None):
3314 InfoExtractor.__init__(self, downloader)
3316 def report_extract_entry(self, url):
3317 """Report downloading extry"""
3318 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3320 def report_date(self, upload_date):
3321 """Report downloading extry"""
3322 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3324 def report_uploader(self, uploader):
3325 """Report downloading extry"""
3326 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3328 def report_title(self, video_title):
3329 """Report downloading extry"""
3330 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3332 def report_extract_vid_page(self, video_page):
3333 """Report information extraction."""
3334 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3336 def _real_extract(self, url):
3337 # Extract id from URL
3338 mobj = re.match(self._VALID_URL, url)
3340 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3343 post_url = mobj.group(0)
3344 video_id = mobj.group(2)
3346 video_extension = 'flv'
3348 # Step 1, Retrieve post webpage to extract further information
3349 self.report_extract_entry(post_url)
3350 request = urllib2.Request(post_url)
3352 webpage = urllib2.urlopen(request).read()
3353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3354 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3357 # Extract update date
3359 pattern = 'title="Timestamp">(.*?)</a>'
3360 mobj = re.search(pattern, webpage)
3362 upload_date = mobj.group(1)
3363 # Convert timestring to a format suitable for filename
3364 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3365 upload_date = upload_date.strftime('%Y%m%d')
3366 self.report_date(upload_date)
3370 pattern = r'rel\="author".*?>(.*?)</a>'
3371 mobj = re.search(pattern, webpage)
3373 uploader = mobj.group(1)
3374 self.report_uploader(uploader)
3377 # Get the first line for title
3379 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3380 mobj = re.search(pattern, webpage)
3382 video_title = mobj.group(1)
3383 self.report_title(video_title)
3385 # Step 2, Stimulate clicking the image box to launch video
3386 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3387 mobj = re.search(pattern, webpage)
3389 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3391 video_page = mobj.group(1)
3392 request = urllib2.Request(video_page)
3394 webpage = urllib2.urlopen(request).read()
3395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3396 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3398 self.report_extract_vid_page(video_page)
3401 # Extract video links on video page
3402 """Extract video links of all sizes"""
3403 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3404 mobj = re.findall(pattern, webpage)
3406 self._downloader.trouble(u'ERROR: unable to extract video links')
3408 # Sort in resolution
3409 links = sorted(mobj)
3411 # Choose the lowest of the sort, i.e. highest resolution
3412 video_url = links[-1]
3413 # Only get the url. The resolution part in the tuple has no use anymore
3414 video_url = video_url[-1]
3415 # Treat escaped \u0026 style hex
3416 video_url = unicode(video_url, "unicode_escape")
3420 'id': video_id.decode('utf-8'),
3422 'uploader': uploader.decode('utf-8'),
3423 'upload_date': upload_date.decode('utf-8'),
3424 'title': video_title.decode('utf-8'),
3425 'ext': video_extension.decode('utf-8'),