2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is defined by each concrete subclass, not on this base class.
        return re.match(self._VALID_URL, url) is not None

        # NOTE(review): a method header (presumably `def initialize(self):`)
        # appears to be missing above this docstring in this view -- confirm
        # against upstream before relying on this structure.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    NOTE(review): this view of the file is elided -- several `try:` headers,
    `return` statements, dict-literal delimiters and `else:` branches are not
    visible. Comments below describe only what the visible lines establish.
    """

    # Group 1 matches everything before the id (scheme/host/path variants,
    # excluding playlist/artist pages); group 2 captures the video id itself.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the English UI so later page-scraping regexes match reliably.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the `next_url` query parameter from redirection URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension (most entries elided in this view).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> displayable dimensions string (entries elided in this view).
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (SRT) text.

        NOTE(review): the initialization of the `srt` accumulator and the
        conversion of `start` to float appear to be elided in this view --
        confirm against upstream.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration (seconds) when no dur attribute
            end = start + float(dur)
            # hh:mm:ss,mmm timestamps as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the loop header binding `x` is elided in this view.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age before extraction."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # Best-effort: warn and continue without credentials.
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (English UI) so page-scraping regexes match.
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # Login form fields (dict-literal delimiters elided in this view).
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Age-confirmation form fields (delimiters elided in this view).
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract video metadata and download URLs for a YouTube page."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Undo JavaScript backslash escaping in the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several `el` variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            # parse_qs yields {key: [values]}; every access below indexes [0].
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page, normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (optional; empty string when absent)
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions, converted to SRT when requested.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            # Map lang_code -> track name from the caption track list.
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Language preference: explicit option > English > first available.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            # Subtitle problems are warnings, not fatal errors.
            self._downloader.trouble(trouble[0])

        # Token (presence guaranteed by the check above)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: single pseudo-format with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Only formats at or below the quality cap.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension for this itag; flv is the historical default.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Result dictionary fields (dict-literal delimiters elided in this view).
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): this view is elided -- `try:` headers, `return` statements
    and dict-literal delimiters are not visible below.
    """

    # Group 1: video id; group 2: simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then disable the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Age-confirmation form fields (delimiters elided in this view).
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract video metadata and media URL from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        # NOTE(review): assumes a three-letter extension at the URL tail.
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback path: pull mediaData out of the flashvars attribute.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Undo JSON escaping of forward slashes.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dictionary fields (delimiters elided in this view).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): this view is elided -- `try:` headers, `return` statements
    and dict-literal delimiters are not visible below.
    """

    # Group 1: video id; group 2: title slug. Case-insensitive, any TLD.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video metadata and the best-quality media URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))
        # Pick the highest quality present: HQ > SD > LD.
        if 'hqURL' in flashvars: max_quality = 'hqURL'
        elif 'sdURL' in flashvars: max_quality = 'sdURL'
        else: max_quality = 'ldURL'
        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Undo JSON escaping of forward slashes.
        video_url = mobj.group(1).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dictionary fields (delimiters elided in this view).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    NOTE(review): this view is elided -- `try:` headers, `return` statements
    and dict-literal delimiters are not visible below.
    """

    # Group 1: the docid query parameter (video id), across Google ccTLDs.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video metadata and media URL from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: no mp4 download URL -- look for the flv stream URL instead.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JavaScript hex escapes (\x3d '=', \x26 '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnails only appear on the search results page, so search for the id.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # Result dictionary fields (delimiters elided in this view).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): this view is elided -- `try:` headers, `return` statements
    and dict-literal delimiters are not visible below.
    """

    # Group 1: the .flv filename from the `current` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dictionary fields (delimiters elided in this view).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): this view is elided -- `try:` headers, `return` statements
    and dict-literal delimiters are not visible below.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and media URL; rewrites non-/watch/ URLs and recurses once."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # new_video=False prevents a second rewrite pass.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures 'people'/'profile', not the name --
        # the uploader name is in group(2). Looks like a bug; confirm upstream.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dictionary fields (delimiters elided in this view).
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
        # NOTE(review): 'thumbnail' is listed twice; in a dict literal the
        # later (un-decoded) value wins -- confirm which one is intended.
        'thumbnail': video_thumbnail,
980 class VimeoIE(InfoExtractor):
981 """Information extractor for vimeo.com."""
983 # _VALID_URL matches Vimeo URLs
984 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
987 def __init__(self, downloader=None):
988 InfoExtractor.__init__(self, downloader)
990 def report_download_webpage(self, video_id):
991 """Report webpage download."""
992 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
994 def report_extraction(self, video_id):
995 """Report information extraction."""
996 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
998 def _real_extract(self, url, new_video=True):
999 # Extract ID from URL
1000 mobj = re.match(self._VALID_URL, url)
1002 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1005 video_id = mobj.group(1)
1007 # Retrieve video webpage to extract further information
1008 request = urllib2.Request(url, None, std_headers)
1010 self.report_download_webpage(video_id)
1011 webpage = urllib2.urlopen(request).read()
1012 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1013 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1016 # Now we begin extracting as much information as we can from what we
1017 # retrieved. First we extract the information common to all extractors,
1018 # and latter we extract those that are Vimeo specific.
1019 self.report_extraction(video_id)
1021 # Extract the config JSON
1022 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1024 config = json.loads(config)
1026 self._downloader.trouble(u'ERROR: unable to extract info section')
1030 video_title = config["video"]["title"]
1033 video_uploader = config["video"]["owner"]["name"]
1035 # Extract video thumbnail
1036 video_thumbnail = config["video"]["thumbnail"]
1038 # Extract video description
1039 video_description = get_element_by_id("description", webpage.decode('utf8'))
1040 if video_description: video_description = clean_html(video_description)
1041 else: video_description = ''
1043 # Extract upload date
1044 video_upload_date = u'NA'
1045 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1046 if mobj is not None:
1047 video_upload_date = mobj.group(1)
1049 # Vimeo specific: extract request signature and timestamp
1050 sig = config['request']['signature']
1051 timestamp = config['request']['timestamp']
1053 # Vimeo specific: extract video codec and quality information
1054 # TODO bind to format param
1055 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1056 for codec in codecs:
1057 if codec[0] in config["video"]["files"]:
1058 video_codec = codec[0]
1059 video_extension = codec[1]
1060 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1061 else: quality = 'sd'
1064 self._downloader.trouble(u'ERROR: no known codec found')
1067 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1068 %(video_id, sig, timestamp, quality, video_codec.upper())
1073 'uploader': video_uploader,
1074 'upload_date': video_upload_date,
1075 'title': video_title,
1076 'ext': video_extension,
1077 'thumbnail': video_thumbnail,
1078 'description': video_description,
1083 class GenericIE(InfoExtractor):
1084 """Generic last-resort information extractor."""
1087 IE_NAME = u'generic'
1089 def __init__(self, downloader=None):
1090 InfoExtractor.__init__(self, downloader)
1092 def report_download_webpage(self, video_id):
1093 """Report webpage download."""
1094 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1095 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1097 def report_extraction(self, video_id):
1098 """Report information extraction."""
1099 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1101 def report_following_redirect(self, new_url):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1105 def _test_redirect(self, url):
1106 """Check if it is a redirect, like url shorteners, in case restart chain."""
1107 class HeadRequest(urllib2.Request):
1108 def get_method(self):
1111 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1113 Subclass the HTTPRedirectHandler to make it use our
1114 HeadRequest also on the redirected URL
1116 def redirect_request(self, req, fp, code, msg, headers, newurl):
1117 if code in (301, 302, 303, 307):
1118 newurl = newurl.replace(' ', '%20')
1119 newheaders = dict((k,v) for k,v in req.headers.items()
1120 if k.lower() not in ("content-length", "content-type"))
1121 return HeadRequest(newurl,
1123 origin_req_host=req.get_origin_req_host(),
1126 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1128 class HTTPMethodFallback(urllib2.BaseHandler):
1130 Fallback to GET if HEAD is not allowed (405 HTTP error)
1132 def http_error_405(self, req, fp, code, msg, headers):
1136 newheaders = dict((k,v) for k,v in req.headers.items()
1137 if k.lower() not in ("content-length", "content-type"))
1138 return self.parent.open(urllib2.Request(req.get_full_url(),
1140 origin_req_host=req.get_origin_req_host(),
1144 opener = urllib2.OpenerDirector()
1145 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1146 HTTPMethodFallback, HEADRedirectHandler,
1147 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1148 opener.add_handler(handler())
1150 response = opener.open(HeadRequest(url))
1151 new_url = response.geturl()
1153 if url == new_url: return False
1155 self.report_following_redirect(new_url)
1156 self._downloader.download([new_url])
1159 def _real_extract(self, url):
1160 if self._test_redirect(url): return
1162 video_id = url.split('/')[-1]
1163 request = urllib2.Request(url)
1165 self.report_download_webpage(video_id)
1166 webpage = urllib2.urlopen(request).read()
1167 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1170 except ValueError, err:
1171 # since this is the last-resort InfoExtractor, if
1172 # this error is thrown, it'll be thrown here
1173 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1176 self.report_extraction(video_id)
1177 # Start with something easy: JW Player in SWFObject
1178 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1180 # Broaden the search a little bit
1181 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1183 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1186 # It's possible that one of the regexes
1187 # matched, but returned an empty group:
1188 if mobj.group(1) is None:
1189 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1192 video_url = urllib.unquote(mobj.group(1))
1193 video_id = os.path.basename(video_url)
1195 # here's a fun little line of code for you:
1196 video_extension = os.path.splitext(video_id)[1][1:]
1197 video_id = os.path.splitext(video_id)[0]
1199 # it's tempting to parse this further, but you would
1200 # have to take into account all the variations like
1201 # Video Title - Site Name
1202 # Site Name | Video Title
1203 # Video Title - Tagline | Site Name
1204 # and so on and so forth; it's just not practical
1205 mobj = re.search(r'<title>(.*)</title>', webpage)
1207 self._downloader.trouble(u'ERROR: unable to extract title')
1209 video_title = mobj.group(1).decode('utf-8')
1211 # video uploader is domain name
1212 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1214 self._downloader.trouble(u'ERROR: unable to extract title')
1216 video_uploader = mobj.group(1).decode('utf-8')
1219 'id': video_id.decode('utf-8'),
1220 'url': video_url.decode('utf-8'),
1221 'uploader': video_uploader,
1222 'upload_date': u'NA',
1223 'title': video_title,
1224 'ext': video_extension.decode('utf-8'),
1230 class YoutubeSearchIE(InfoExtractor):
1231 """Information Extractor for YouTube search queries."""
1232 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1233 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1234 _max_youtube_results = 1000
1235 IE_NAME = u'youtube:search'
1237 def __init__(self, downloader=None):
1238 InfoExtractor.__init__(self, downloader)
1240 def report_download_page(self, query, pagenum):
1241 """Report attempt to download search page with given number."""
1242 query = query.decode(preferredencoding())
1243 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1245 def _real_extract(self, query):
1246 mobj = re.match(self._VALID_URL, query)
1248 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1251 prefix, query = query.split(':')
1253 query = query.encode('utf-8')
1255 self._download_n_results(query, 1)
1257 elif prefix == 'all':
1258 self._download_n_results(query, self._max_youtube_results)
1264 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1266 elif n > self._max_youtube_results:
1267 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1268 n = self._max_youtube_results
1269 self._download_n_results(query, n)
1271 except ValueError: # parsing prefix as integer fails
1272 self._download_n_results(query, 1)
1275 def _download_n_results(self, query, n):
1276 """Downloads a specified number of results for a query"""
1282 while (50 * pagenum) < limit:
1283 self.report_download_page(query, pagenum+1)
1284 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1285 request = urllib2.Request(result_url)
1287 data = urllib2.urlopen(request).read()
1288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1289 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1291 api_response = json.loads(data)['data']
1293 new_ids = list(video['id'] for video in api_response['items'])
1294 video_ids += new_ids
1296 limit = min(n, api_response['totalItems'])
1299 if len(video_ids) > n:
1300 video_ids = video_ids[:n]
1301 for id in video_ids:
1302 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1306 class GoogleSearchIE(InfoExtractor):
1307 """Information Extractor for Google Video search queries."""
1308 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1309 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1310 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1311 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1312 _max_google_results = 1000
1313 IE_NAME = u'video.google:search'
1315 def __init__(self, downloader=None):
1316 InfoExtractor.__init__(self, downloader)
1318 def report_download_page(self, query, pagenum):
1319 """Report attempt to download playlist page with given number."""
1320 query = query.decode(preferredencoding())
1321 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1323 def _real_extract(self, query):
1324 mobj = re.match(self._VALID_URL, query)
1326 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1329 prefix, query = query.split(':')
1331 query = query.encode('utf-8')
1333 self._download_n_results(query, 1)
1335 elif prefix == 'all':
1336 self._download_n_results(query, self._max_google_results)
1342 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1344 elif n > self._max_google_results:
1345 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1346 n = self._max_google_results
1347 self._download_n_results(query, n)
1349 except ValueError: # parsing prefix as integer fails
1350 self._download_n_results(query, 1)
1353 def _download_n_results(self, query, n):
1354 """Downloads a specified number of results for a query"""
1360 self.report_download_page(query, pagenum)
1361 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1362 request = urllib2.Request(result_url)
1364 page = urllib2.urlopen(request).read()
1365 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1366 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1369 # Extract video identifiers
1370 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1371 video_id = mobj.group(1)
1372 if video_id not in video_ids:
1373 video_ids.append(video_id)
1374 if len(video_ids) == n:
1375 # Specified n videos reached
1376 for id in video_ids:
1377 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1380 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1381 for id in video_ids:
1382 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1385 pagenum = pagenum + 1
1388 class YahooSearchIE(InfoExtractor):
1389 """Information Extractor for Yahoo! Video search queries."""
1390 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1391 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1392 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1393 _MORE_PAGES_INDICATOR = r'\s*Next'
1394 _max_yahoo_results = 1000
1395 IE_NAME = u'video.yahoo:search'
1397 def __init__(self, downloader=None):
1398 InfoExtractor.__init__(self, downloader)
1400 def report_download_page(self, query, pagenum):
1401 """Report attempt to download playlist page with given number."""
1402 query = query.decode(preferredencoding())
1403 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1405 def _real_extract(self, query):
1406 mobj = re.match(self._VALID_URL, query)
1408 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1411 prefix, query = query.split(':')
1413 query = query.encode('utf-8')
1415 self._download_n_results(query, 1)
1417 elif prefix == 'all':
1418 self._download_n_results(query, self._max_yahoo_results)
1424 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1426 elif n > self._max_yahoo_results:
1427 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1428 n = self._max_yahoo_results
1429 self._download_n_results(query, n)
1431 except ValueError: # parsing prefix as integer fails
1432 self._download_n_results(query, 1)
1435 def _download_n_results(self, query, n):
1436 """Downloads a specified number of results for a query"""
1439 already_seen = set()
1443 self.report_download_page(query, pagenum)
1444 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1445 request = urllib2.Request(result_url)
1447 page = urllib2.urlopen(request).read()
1448 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1452 # Extract video identifiers
1453 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1454 video_id = mobj.group(1)
1455 if video_id not in already_seen:
1456 video_ids.append(video_id)
1457 already_seen.add(video_id)
1458 if len(video_ids) == n:
1459 # Specified n videos reached
1460 for id in video_ids:
1461 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1464 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1465 for id in video_ids:
1466 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1469 pagenum = pagenum + 1
1472 class YoutubePlaylistIE(InfoExtractor):
1473 """Information Extractor for YouTube playlists."""
1475 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1476 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1477 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s'
1478 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1479 IE_NAME = u'youtube:playlist'
1481 def __init__(self, downloader=None):
1482 InfoExtractor.__init__(self, downloader)
1484 def report_download_page(self, playlist_id, pagenum):
1485 """Report attempt to download playlist page with given number."""
1486 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1488 def _real_extract(self, url):
1489 # Extract playlist id
1490 mobj = re.match(self._VALID_URL, url)
1492 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1496 if mobj.group(3) is not None:
1497 self._downloader.download([mobj.group(3)])
1500 # Download playlist pages
1501 # prefix is 'p' as default for playlists but there are other types that need extra care
1502 playlist_prefix = mobj.group(1)
1503 if playlist_prefix == 'a':
1504 playlist_access = 'artist'
1506 playlist_prefix = 'p'
1507 playlist_access = 'view_play_list'
1508 playlist_id = mobj.group(2)
1513 self.report_download_page(playlist_id, pagenum)
1514 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1515 request = urllib2.Request(url)
1517 page = urllib2.urlopen(request).read()
1518 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1522 # Extract video identifiers
1524 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1525 if mobj.group(1) not in ids_in_page:
1526 ids_in_page.append(mobj.group(1))
1527 video_ids.extend(ids_in_page)
1529 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1531 pagenum = pagenum + 1
1533 playliststart = self._downloader.params.get('playliststart', 1) - 1
1534 playlistend = self._downloader.params.get('playlistend', -1)
1535 if playlistend == -1:
1536 video_ids = video_ids[playliststart:]
1538 video_ids = video_ids[playliststart:playlistend]
1540 for id in video_ids:
1541 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1545 class YoutubeUserIE(InfoExtractor):
1546 """Information Extractor for YouTube users."""
1548 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1549 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1550 _GDATA_PAGE_SIZE = 50
1551 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1552 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1553 IE_NAME = u'youtube:user'
1555 def __init__(self, downloader=None):
1556 InfoExtractor.__init__(self, downloader)
1558 def report_download_page(self, username, start_index):
1559 """Report attempt to download user page."""
1560 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1561 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1563 def _real_extract(self, url):
1565 mobj = re.match(self._VALID_URL, url)
1567 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1570 username = mobj.group(1)
1572 # Download video ids using YouTube Data API. Result size per
1573 # query is limited (currently to 50 videos) so we need to query
1574 # page by page until there are no video ids - it means we got
1581 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1582 self.report_download_page(username, start_index)
1584 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1587 page = urllib2.urlopen(request).read()
1588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1592 # Extract video identifiers
1595 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1596 if mobj.group(1) not in ids_in_page:
1597 ids_in_page.append(mobj.group(1))
1599 video_ids.extend(ids_in_page)
1601 # A little optimization - if current page is not
1602 # "full", ie. does not contain PAGE_SIZE video ids then
1603 # we can assume that this page is the last one - there
1604 # are no more ids on further pages - no need to query
1607 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1612 all_ids_count = len(video_ids)
1613 playliststart = self._downloader.params.get('playliststart', 1) - 1
1614 playlistend = self._downloader.params.get('playlistend', -1)
1616 if playlistend == -1:
1617 video_ids = video_ids[playliststart:]
1619 video_ids = video_ids[playliststart:playlistend]
1621 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1622 (username, all_ids_count, len(video_ids)))
1624 for video_id in video_ids:
1625 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1628 class BlipTVUserIE(InfoExtractor):
1629 """Information Extractor for blip.tv users."""
1631 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1633 IE_NAME = u'blip.tv:user'
1635 def __init__(self, downloader=None):
1636 InfoExtractor.__init__(self, downloader)
1638 def report_download_page(self, username, pagenum):
1639 """Report attempt to download user page."""
1640 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1641 (self.IE_NAME, username, pagenum))
1643 def _real_extract(self, url):
1645 mobj = re.match(self._VALID_URL, url)
1647 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1650 username = mobj.group(1)
1652 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1654 request = urllib2.Request(url)
1657 page = urllib2.urlopen(request).read().decode('utf-8')
1658 mobj = re.search(r'data-users-id="([^"]+)"', page)
1659 page_base = page_base % mobj.group(1)
1660 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1661 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1665 # Download video ids using BlipTV Ajax calls. Result size per
1666 # query is limited (currently to 12 videos) so we need to query
1667 # page by page until there are no video ids - it means we got
1674 self.report_download_page(username, pagenum)
1676 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1679 page = urllib2.urlopen(request).read().decode('utf-8')
1680 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1681 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684 # Extract video identifiers
1687 for mobj in re.finditer(r'href="/([^"]+)"', page):
1688 if mobj.group(1) not in ids_in_page:
1689 ids_in_page.append(unescapeHTML(mobj.group(1)))
1691 video_ids.extend(ids_in_page)
1693 # A little optimization - if current page is not
1694 # "full", ie. does not contain PAGE_SIZE video ids then
1695 # we can assume that this page is the last one - there
1696 # are no more ids on further pages - no need to query
1699 if len(ids_in_page) < self._PAGE_SIZE:
1704 all_ids_count = len(video_ids)
1705 playliststart = self._downloader.params.get('playliststart', 1) - 1
1706 playlistend = self._downloader.params.get('playlistend', -1)
1708 if playlistend == -1:
1709 video_ids = video_ids[playliststart:]
1711 video_ids = video_ids[playliststart:playlistend]
1713 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1714 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1716 for video_id in video_ids:
1717 self._downloader.download([u'http://blip.tv/'+video_id])
1720 class DepositFilesIE(InfoExtractor):
1721 """Information extractor for depositfiles.com"""
1723 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1724 IE_NAME = u'DepositFiles'
1726 def __init__(self, downloader=None):
1727 InfoExtractor.__init__(self, downloader)
1729 def report_download_webpage(self, file_id):
1730 """Report webpage download."""
1731 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1733 def report_extraction(self, file_id):
1734 """Report information extraction."""
1735 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1737 def _real_extract(self, url):
1738 file_id = url.split('/')[-1]
1739 # Rebuild url in english locale
1740 url = 'http://depositfiles.com/en/files/' + file_id
1742 # Retrieve file webpage with 'Free download' button pressed
1743 free_download_indication = { 'gateway_result' : '1' }
1744 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1746 self.report_download_webpage(file_id)
1747 webpage = urllib2.urlopen(request).read()
1748 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1752 # Search for the real file URL
1753 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1754 if (mobj is None) or (mobj.group(1) is None):
1755 # Try to figure out reason of the error.
1756 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1757 if (mobj is not None) and (mobj.group(1) is not None):
1758 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1759 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1761 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1764 file_url = mobj.group(1)
1765 file_extension = os.path.splitext(file_url)[1][1:]
1767 # Search for file title
1768 mobj = re.search(r'<b title="(.*?)">', webpage)
1770 self._downloader.trouble(u'ERROR: unable to extract title')
1772 file_title = mobj.group(1).decode('utf-8')
1775 'id': file_id.decode('utf-8'),
1776 'url': file_url.decode('utf-8'),
1778 'upload_date': u'NA',
1779 'title': file_title,
1780 'ext': file_extension.decode('utf-8'),
1786 class FacebookIE(InfoExtractor):
1787 """Information Extractor for Facebook"""
1789 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1790 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1791 _NETRC_MACHINE = 'facebook'
1792 _available_formats = ['video', 'highqual', 'lowqual']
1793 _video_extensions = {
1798 IE_NAME = u'facebook'
1800 def __init__(self, downloader=None):
1801 InfoExtractor.__init__(self, downloader)
1803 def _reporter(self, message):
1804 """Add header and report message."""
1805 self._downloader.to_screen(u'[facebook] %s' % message)
1807 def report_login(self):
1808 """Report attempt to log in."""
1809 self._reporter(u'Logging in')
1811 def report_video_webpage_download(self, video_id):
1812 """Report attempt to download video webpage."""
1813 self._reporter(u'%s: Downloading video webpage' % video_id)
1815 def report_information_extraction(self, video_id):
1816 """Report attempt to extract video information."""
1817 self._reporter(u'%s: Extracting video information' % video_id)
1819 def _parse_page(self, video_webpage):
1820 """Extract video information from page"""
1822 data = {'title': r'\("video_title", "(.*?)"\)',
1823 'description': r'<div class="datawrap">(.*?)</div>',
1824 'owner': r'\("video_owner_name", "(.*?)"\)',
1825 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1828 for piece in data.keys():
1829 mobj = re.search(data[piece], video_webpage)
1830 if mobj is not None:
1831 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1835 for fmt in self._available_formats:
1836 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1837 if mobj is not None:
1838 # URL is in a Javascript segment inside an escaped Unicode format within
1839 # the generally utf-8 page
1840 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1841 video_info['video_urls'] = video_urls
1845 def _real_initialize(self):
1846 if self._downloader is None:
1851 downloader_params = self._downloader.params
1853 # Attempt to use provided username and password or .netrc data
1854 if downloader_params.get('username', None) is not None:
1855 useremail = downloader_params['username']
1856 password = downloader_params['password']
1857 elif downloader_params.get('usenetrc', False):
1859 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1860 if info is not None:
1864 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1865 except (IOError, netrc.NetrcParseError), err:
1866 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1869 if useremail is None:
1878 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1881 login_results = urllib2.urlopen(request).read()
1882 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1883 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1885 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1886 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1889 def _real_extract(self, url):
1890 mobj = re.match(self._VALID_URL, url)
1892 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1894 video_id = mobj.group('ID')
1897 self.report_video_webpage_download(video_id)
1898 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1900 page = urllib2.urlopen(request)
1901 video_webpage = page.read()
1902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1903 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1906 # Start extracting information
1907 self.report_information_extraction(video_id)
1909 # Extract information
1910 video_info = self._parse_page(video_webpage)
1913 if 'owner' not in video_info:
1914 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1916 video_uploader = video_info['owner']
1919 if 'title' not in video_info:
1920 self._downloader.trouble(u'ERROR: unable to extract video title')
1922 video_title = video_info['title']
1923 video_title = video_title.decode('utf-8')
1926 if 'thumbnail' not in video_info:
1927 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1928 video_thumbnail = ''
1930 video_thumbnail = video_info['thumbnail']
1934 if 'upload_date' in video_info:
1935 upload_time = video_info['upload_date']
1936 timetuple = email.utils.parsedate_tz(upload_time)
1937 if timetuple is not None:
1939 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1944 video_description = video_info.get('description', 'No description available.')
1946 url_map = video_info['video_urls']
1947 if len(url_map.keys()) > 0:
1948 # Decide which formats to download
1949 req_format = self._downloader.params.get('format', None)
1950 format_limit = self._downloader.params.get('format_limit', None)
1952 if format_limit is not None and format_limit in self._available_formats:
1953 format_list = self._available_formats[self._available_formats.index(format_limit):]
1955 format_list = self._available_formats
1956 existing_formats = [x for x in format_list if x in url_map]
1957 if len(existing_formats) == 0:
1958 self._downloader.trouble(u'ERROR: no known formats available for video')
1960 if req_format is None:
1961 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1962 elif req_format == 'worst':
1963 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1964 elif req_format == '-1':
1965 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1968 if req_format not in url_map:
1969 self._downloader.trouble(u'ERROR: requested format not available')
1971 video_url_list = [(req_format, url_map[req_format])] # Specific format
1974 for format_param, video_real_url in video_url_list:
1976 video_extension = self._video_extensions.get(format_param, 'mp4')
1979 'id': video_id.decode('utf-8'),
1980 'url': video_real_url.decode('utf-8'),
1981 'uploader': video_uploader.decode('utf-8'),
1982 'upload_date': upload_date,
1983 'title': video_title,
1984 'ext': video_extension.decode('utf-8'),
1985 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1986 'thumbnail': video_thumbnail.decode('utf-8'),
1987 'description': video_description.decode('utf-8'),
1992 class BlipTVIE(InfoExtractor):
1993 """Information extractor for blip.tv"""
1995 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1996 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1997 IE_NAME = u'blip.tv'
def report_extraction(self, file_id):
    """Log to the status screen that metadata extraction has started."""
    status = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(status)

def report_direct_download(self, title):
    """Log to the status screen that the URL is a direct media file."""
    status = u'[%s] %s: Direct download detected' % (self.IE_NAME, title)
    self._downloader.to_screen(status)
# BlipTVIE._real_extract: turn a blip.tv URL into a metadata dict, either by
# detecting a direct media download (video/* Content-Type) or by querying the
# site's JSON API.  NOTE(review): this listing is non-contiguous (the leading
# original line numbers jump), so interleaved control-flow lines (try:, else:,
# return, dict delimiters) are not shown here.
2007 def _real_extract(self, url):
2008 mobj = re.match(self._VALID_URL, url)
2010 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Build the JSON API request for the page (cchar joins the query string;
# its assignment is on a line missing from this listing).
2017 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2018 request = urllib2.Request(json_url.encode('utf-8'))
2019 self.report_extraction(mobj.group(1))
2022 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL already points at the media itself.
2023 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2024 basename = url.split('/')[-1]
2025 title,ext = os.path.splitext(basename)
2026 title = title.decode('UTF-8')
2027 ext = ext.replace('.', '')
2028 self.report_direct_download(title)
2036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2037 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Regular page: read and parse the JSON payload returned by the API.
2039 if info is None: # Regular URL
2041 json_code = urlh.read()
2042 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2043 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2047 json_data = json.loads(json_code)
2048 if 'Post' in json_data:
2049 data = json_data['Post']
# 'datestamp' format is e.g. '04-21-12 08:30PM'; normalize to YYYYMMDD.
2053 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2054 video_url = data['media']['url']
2055 umobj = re.match(self._URL_EXT, video_url)
2057 raise ValueError('Can not determine filename extension')
2058 ext = umobj.group(1)
# Assemble the info dict consumed by FileDownloader.
2061 'id': data['item_id'],
2063 'uploader': data['display_name'],
2064 'upload_date': upload_date,
2065 'title': data['title'],
2067 'format': data['media']['mimeType'],
2068 'thumbnail': data['thumbnailUrl'],
2069 'description': data['description'],
2070 'player_url': data['embedUrl']
2072 except (ValueError,KeyError), err:
2073 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): presumably blip.tv serves the media variant intended for the
# iTunes client when this User-Agent is sent — confirm before changing.
2076 std_headers['User-Agent'] = 'iTunes/10.6.1'
2080 class MyVideoIE(InfoExtractor):
2081 """Information Extractor for myvideo.de."""
2083 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2084 IE_NAME = u'myvideo'
def __init__(self, downloader=None):
    """Set up the extractor, delegating to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce that the myvideo.de watch page is being fetched."""
    line = u'[myvideo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(line)

def report_extraction(self, video_id):
    """Announce that metadata extraction for video_id has started."""
    line = u'[myvideo] %s: Extracting information' % video_id
    self._downloader.to_screen(line)
2097 def _real_extract(self,url):
2098 mobj = re.match(self._VALID_URL, url)
2100 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2103 video_id = mobj.group(1)
2106 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2108 self.report_download_webpage(video_id)
2109 webpage = urllib2.urlopen(request).read()
2110 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2111 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2114 self.report_extraction(video_id)
2115 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2118 self._downloader.trouble(u'ERROR: unable to extract media URL')
2120 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2122 mobj = re.search('<title>([^<]+)</title>', webpage)
2124 self._downloader.trouble(u'ERROR: unable to extract title')
2127 video_title = mobj.group(1)
2133 'upload_date': u'NA',
2134 'title': video_title,
2140 class ComedyCentralIE(InfoExtractor):
2141 """Information extractor for The Daily Show and Colbert Report """
2143 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2144 IE_NAME = u'comedycentral'
def report_extraction(self, episode_id):
    """Tell the user that episode information is being extracted."""
    line = u'[comedycentral] %s: Extracting information' % episode_id
    self._downloader.to_screen(line)

def report_config_download(self, episode_id):
    """Tell the user that the media configuration is being downloaded."""
    line = u'[comedycentral] %s: Downloading configuration' % episode_id
    self._downloader.to_screen(line)

def report_index_download(self, episode_id):
    """Tell the user that the show index is being downloaded."""
    line = u'[comedycentral] %s: Downloading show index' % episode_id
    self._downloader.to_screen(line)

def report_player_url(self, episode_id):
    """Tell the user that the player URL is being resolved."""
    line = u'[comedycentral] %s: Determining player URL' % episode_id
    self._downloader.to_screen(line)
# ComedyCentralIE._real_extract: resolve a Daily Show / Colbert URL (or a
# shortname like ':tds') to per-segment media via the MTV Networks feeds.
# NOTE(review): listing is non-contiguous — try:/else:/return and several
# assignment lines are missing between the numbered rows.
2158 def _real_extract(self, url):
2159 mobj = re.match(self._VALID_URL, url)
2161 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames redirect to the show's "full episodes" landing page.
2164 if mobj.group('shortname'):
2165 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2166 url = u'http://www.thedailyshow.com/full-episodes/'
2168 url = u'http://www.colbertnation.com/full-episodes/'
2169 mobj = re.match(self._VALID_URL, url)
2170 assert mobj is not None
2172 dlNewest = not mobj.group('episode')
2174 epTitle = mobj.group('showname')
2176 epTitle = mobj.group('episode')
2178 req = urllib2.Request(url)
2179 self.report_extraction(epTitle)
2181 htmlHandle = urllib2.urlopen(req)
2182 html = htmlHandle.read()
2183 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2184 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to the newest episode; re-match the final URL.
2187 url = htmlHandle.geturl()
2188 mobj = re.match(self._VALID_URL, url)
2190 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2192 if mobj.group('episode') == '':
2193 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2195 epTitle = mobj.group('episode')
# Locate the Flash player URL and the mtvnservices URI embedded in the page.
2197 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2198 if len(mMovieParams) == 0:
2199 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2202 playerUrl_raw = mMovieParams[0][0]
2203 self.report_player_url(epTitle)
2205 urlHandle = urllib2.urlopen(playerUrl_raw)
2206 playerUrl = urlHandle.geturl()
2207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2208 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index listing every segment of the episode.
2211 uri = mMovieParams[0][1]
2212 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2213 self.report_index_download(epTitle)
2215 indexXml = urllib2.urlopen(indexUrl).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2222 idoc = xml.etree.ElementTree.fromstring(indexXml)
2223 itemEls = idoc.findall('.//item')
# One <item> per segment: fetch its mediaGen config and collect renditions.
2224 for itemEl in itemEls:
2225 mediaId = itemEl.findall('./guid')[0].text
2226 shortMediaId = mediaId.split(':')[-1]
2227 showId = mediaId.split(':')[-2].replace('.com', '')
2228 officialTitle = itemEl.findall('./title')[0].text
2229 officialDate = itemEl.findall('./pubDate')[0].text
2231 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2232 urllib.urlencode({'uri': mediaId}))
2233 configReq = urllib2.Request(configUrl)
2234 self.report_config_download(epTitle)
2236 configXml = urllib2.urlopen(configReq).read()
2237 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2238 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2241 cdoc = xml.etree.ElementTree.fromstring(configXml)
2243 for rendition in cdoc.findall('.//rendition'):
2244 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2248 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2251 # For now, just pick the highest bitrate
2252 format,video_url = turls[-1]
2254 effTitle = showId + u'-' + epTitle
2259 'upload_date': officialDate,
2264 'description': officialTitle,
2265 'player_url': playerUrl
2268 results.append(info)
2273 class EscapistIE(InfoExtractor):
2274 """Information extractor for The Escapist """
2276 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2277 IE_NAME = u'escapist'
def report_extraction(self, showName):
    """Print a status line saying extraction has begun for showName."""
    self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

def report_config_download(self, showName):
    """Print a status line saying the player configuration is downloading."""
    self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# EscapistIE._real_extract: pull the video URL out of the flash player's
# JSON-ish config referenced from the page's OpenGraph meta tags.
# NOTE(review): listing is non-contiguous — try:/return and the final info
# dict's opening lines are missing between the numbered rows.
2285 def _real_extract(self, url):
2286 mobj = re.match(self._VALID_URL, url)
2288 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2290 showName = mobj.group('showname')
2291 videoId = mobj.group('episode')
2293 self.report_extraction(showName)
# Decode the page using the charset advertised in Content-Type (utf-8 fallback).
2295 webPage = urllib2.urlopen(url)
2296 webPageBytes = webPage.read()
2297 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2298 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from meta tags.
2303 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2304 description = unescapeHTML(descMatch.group(1))
2305 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2306 imgUrl = unescapeHTML(imgMatch.group(1))
2307 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2308 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2309 configUrlMatch = re.search('config=(.*)$', playerUrl)
2310 configUrl = urllib2.unquote(configUrlMatch.group(1))
2312 self.report_config_download(showName)
2314 configJSON = urllib2.urlopen(configUrl).read()
2315 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
# The config uses single quotes; swap them so json.loads accepts it.
2319 # Technically, it's JavaScript, not JSON
2320 configJSON = configJSON.replace("'", '"')
2323 config = json.loads(configJSON)
2324 except (ValueError,), err:
2325 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2328 playlist = config['playlist']
2329 videoUrl = playlist[1]['url']
2334 'uploader': showName,
2335 'upload_date': None,
2339 'thumbnail': imgUrl,
2340 'description': description,
2341 'player_url': playerUrl,
2347 class CollegeHumorIE(InfoExtractor):
2348 """Information extractor for collegehumor.com"""
2350 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2351 IE_NAME = u'collegehumor'
def report_webpage(self, video_id):
    """Announce that the video's webpage is being downloaded."""
    text = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(text)

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    text = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(text)
# CollegeHumorIE._real_extract: map the public video id to the internal id,
# then read title/url/thumbnail from the moogaloop metadata XML.
# NOTE(review): listing is non-contiguous — try:/return/except lines are
# missing between the numbered rows.
2361 def _real_extract(self, url):
2362 mobj = re.match(self._VALID_URL, url)
2364 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2366 video_id = mobj.group('videoid')
2368 self.report_webpage(video_id)
2369 request = urllib2.Request(url)
2371 webpage = urllib2.urlopen(request).read()
2372 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2373 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an element id of the form video:<internal id>.
2376 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2378 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2380 internal_video_id = m.group('internalvideoid')
2384 'internal_id': internal_video_id,
2387 self.report_extraction(video_id)
2388 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2390 metaXml = urllib2.urlopen(xmlUrl).read()
2391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Parse the metadata XML into the info dict; extension comes from the URL.
2395 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2397 videoNode = mdoc.findall('./video')[0]
2398 info['description'] = videoNode.findall('./description')[0].text
2399 info['title'] = videoNode.findall('./caption')[0].text
2400 info['url'] = videoNode.findall('./file')[0].text
2401 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2402 info['ext'] = info['url'].rpartition('.')[2]
2403 info['format'] = info['ext']
2405 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2411 class XVideosIE(InfoExtractor):
2412 """Information extractor for xvideos.com"""
2414 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2415 IE_NAME = u'xvideos'
def report_webpage(self, video_id):
    """Announce that the video's webpage is being downloaded."""
    self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# XVideosIE._real_extract: scrape flv URL, title and thumbnail straight out
# of the watch page's HTML.  NOTE(review): listing is non-contiguous —
# try:/return/except lines are missing between the numbered rows.
2425 def _real_extract(self, url):
2426 mobj = re.match(self._VALID_URL, url)
2428 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2430 video_id = mobj.group(1).decode('utf-8')
2432 self.report_webpage(video_id)
2434 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2436 webpage = urllib2.urlopen(request).read()
2437 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2438 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2441 self.report_extraction(video_id)
# The page exposes the media URL percent-encoded in a flv_url parameter.
2445 mobj = re.search(r'flv_url=(.+?)&', webpage)
2447 self._downloader.trouble(u'ERROR: unable to extract video url')
2449 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2453 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2455 self._downloader.trouble(u'ERROR: unable to extract video title')
2457 video_title = mobj.group(1).decode('utf-8')
2460 # Extract video thumbnail
2461 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2463 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2465 video_thumbnail = mobj.group(0).decode('utf-8')
2471 'upload_date': None,
2472 'title': video_title,
2475 'thumbnail': video_thumbnail,
2476 'description': None,
2483 class SoundcloudIE(InfoExtractor):
2484 """Information extractor for soundcloud.com
2485 To access the media, the uid of the song and a stream token
2486 must be extracted from the page source and the script must make
2487 a request to media.soundcloud.com/crossdomain.xml. Then
2488 the media can be grabbed by requesting from an url composed
2489 of the stream token and uid
2492 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2493 IE_NAME = u'soundcloud'
def __init__(self, downloader=None):
    """Construct the extractor via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_webpage(self, video_id):
    """Announce that the track's webpage is being downloaded."""
    notice = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)
# SoundcloudIE._real_extract: scrape uid + stream token from the track page
# and build the media.soundcloud.com stream URL from them.
# NOTE(review): listing is non-contiguous — try:/return/if-None lines are
# missing between the numbered rows.
2506 def _real_extract(self, url):
2507 mobj = re.match(self._VALID_URL, url)
2509 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2512 # extract uploader (which is in the url)
2513 uploader = mobj.group(1).decode('utf-8')
2514 # extract simple title (uploader + slug of song title)
2515 slug_title = mobj.group(2).decode('utf-8')
2516 simple_title = uploader + u'-' + slug_title
2518 self.report_webpage('%s/%s' % (uploader, slug_title))
2520 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2522 webpage = urllib2.urlopen(request).read()
2523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2524 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2527 self.report_extraction('%s/%s' % (uploader, slug_title))
2529 # extract uid and stream token that soundcloud hands out for access
2530 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2532 video_id = mobj.group(1)
2533 stream_token = mobj.group(2)
2535 # extract unsimplified title
2536 mobj = re.search('"title":"(.*?)",', webpage)
2538 title = mobj.group(1).decode('utf-8')
2540 title = simple_title
2542 # construct media url (with uid/token)
2543 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2544 mediaURL = mediaURL % (video_id, stream_token)
2547 description = u'No description available'
2548 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2550 description = mobj.group(1)
# Parse the human-readable upload date into YYYYMMDD; failure is non-fatal.
2554 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2557 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2558 except Exception, e:
2559 self._downloader.to_stderr(str(e))
2561 # for soundcloud, a request to a cross domain is required for cookies
2562 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2565 'id': video_id.decode('utf-8'),
2567 'uploader': uploader.decode('utf-8'),
2568 'upload_date': upload_date,
2573 'description': description.decode('utf-8')
2577 class InfoQIE(InfoExtractor):
2578 """Information extractor for infoq.com"""
2580 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
def report_webpage(self, video_id):
    """Announce that the talk's webpage is being downloaded."""
    self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# InfoQIE._real_extract: decode the base64 jsclassref attribute into an
# rtmpe stream path and scrape title/description from the page.
# NOTE(review): listing is non-contiguous — try:/return/if-None lines are
# missing between the numbered rows.
2591 def _real_extract(self, url):
2592 mobj = re.match(self._VALID_URL, url)
2594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597 self.report_webpage(url)
2599 request = urllib2.Request(url)
2601 webpage = urllib2.urlopen(request).read()
2602 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2603 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2606 self.report_extraction(url)
# jsclassref holds the base64-encoded tail of the rtmpe URL.
2610 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2612 self._downloader.trouble(u'ERROR: unable to extract video url')
2614 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2618 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2620 self._downloader.trouble(u'ERROR: unable to extract video title')
2622 video_title = mobj.group(1).decode('utf-8')
2624 # Extract description
2625 video_description = u'No description available.'
2626 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2627 if mobj is not None:
2628 video_description = mobj.group(1).decode('utf-8')
# The media filename itself supplies both the id and the extension.
2630 video_filename = video_url.split('/')[-1]
2631 video_id, extension = video_filename.split('.')
2637 'upload_date': None,
2638 'title': video_title,
2640 'format': extension, # Extension is always(?) mp4, but seems to be flv
2642 'description': video_description,
2648 class MixcloudIE(InfoExtractor):
2649 """Information extractor for www.mixcloud.com"""
2650 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2651 IE_NAME = u'mixcloud'
def __init__(self, downloader=None):
    """Construct the extractor via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_json(self, file_id):
    """Announce that the cloudcast JSON metadata is being downloaded."""
    self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

def report_extraction(self, file_id):
    """Announce that information extraction has begun."""
    self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Helpers for picking a concrete media URL out of the cloudcast JSON.
# NOTE(review): listing is non-contiguous — try:/return lines are missing
# between the numbered rows (e.g. get_urls's try: and final return url_list).
2664 def get_urls(self, jsonData, fmt, bitrate='best'):
2665 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a bare [urls] list; the
# TypeError branch below handles the bitrate-less shape.
2668 bitrate_list = jsonData[fmt]
2669 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2670 bitrate = max(bitrate_list) # select highest
2672 url_list = jsonData[fmt][bitrate]
2673 except TypeError: # we have no bitrate info.
2674 url_list = jsonData[fmt]
2677 def check_urls(self, url_list):
2678 """Returns 1st active url from list"""
# Probe each candidate with a real request; network errors skip to the next.
2679 for url in url_list:
2681 urllib2.urlopen(url)
2683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2688 def _print_formats(self, formats):
# Dump every available format/bitrate/extension combination to stdout.
2689 print 'Available formats:'
2690 for fmt in formats.keys():
2691 for b in formats[fmt]:
2693 ext = formats[fmt][b][0]
2694 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2695 except TypeError: # we have no bitrate info
2696 ext = formats[fmt][0]
2697 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
# MixcloudIE._real_extract: fetch the cloudcast JSON via the public API and
# select a working media URL for the requested format.
# NOTE(review): listing is non-contiguous — try:/return/else: lines are
# missing between the numbered rows.
2700 def _real_extract(self, url):
2701 mobj = re.match(self._VALID_URL, url)
2703 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2705 # extract uploader & filename from url
2706 uploader = mobj.group(1).decode('utf-8')
2707 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2709 # construct API request
2710 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2711 # retrieve .json file with links to files
2712 request = urllib2.Request(file_url)
2714 self.report_download_json(file_url)
2715 jsonData = urllib2.urlopen(request).read()
2716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2717 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2721 json_data = json.loads(jsonData)
2722 player_url = json_data['player_swf_url']
2723 formats = dict(json_data['audio_formats'])
2725 req_format = self._downloader.params.get('format', None)
2728 if self._downloader.params.get('listformats', None):
2729 self._print_formats(formats)
# 'best' (or unset): probe formats until one yields a live URL; otherwise
# honor the specific requested format.
2732 if req_format is None or req_format == 'best':
2733 for format_param in formats.keys():
2734 url_list = self.get_urls(formats, format_param)
2736 file_url = self.check_urls(url_list)
2737 if file_url is not None:
2740 if req_format not in formats.keys():
2741 self._downloader.trouble(u'ERROR: format is not available')
2744 url_list = self.get_urls(formats, req_format)
2745 file_url = self.check_urls(url_list)
2746 format_param = req_format
2749 'id': file_id.decode('utf-8'),
2750 'url': file_url.decode('utf-8'),
2751 'uploader': uploader.decode('utf-8'),
2752 'upload_date': u'NA',
2753 'title': json_data['name'],
2754 'ext': file_url.split('.')[-1].decode('utf-8'),
2755 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2756 'thumbnail': json_data['thumbnail_url'],
2757 'description': json_data['description'],
2758 'player_url': player_url.decode('utf-8'),
2761 class StanfordOpenClassroomIE(InfoExtractor):
2762 """Information extractor for Stanford's Open ClassRoom"""
2764 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2765 IE_NAME = u'stanfordoc'
def report_download_webpage(self, objid):
    """Announce that the page for objid is being downloaded."""
    msg = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    msg = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(msg)
# StanfordOpenClassroomIE._real_extract: three-way dispatch — a specific
# video, a course page (list of videos), or the root index (list of courses).
# Course/root branches recurse via self.extract on reference entries.
# NOTE(review): listing is non-contiguous — try:/return/else: and several
# dict-literal lines are missing between the numbered rows.
2775 def _real_extract(self, url):
2776 mobj = re.match(self._VALID_URL, url)
2778 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2781 if mobj.group('course') and mobj.group('video'): # A specific video
2782 course = mobj.group('course')
2783 video = mobj.group('video')
2785 'id': course + '_' + video,
2788 self.report_extraction(info['id'])
2789 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2790 xmlUrl = baseUrl + video + '.xml'
2792 metaXml = urllib2.urlopen(xmlUrl).read()
2793 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2796 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2798 info['title'] = mdoc.findall('./title')[0].text
2799 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2801 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2803 info['ext'] = info['url'].rpartition('.')[2]
2804 info['format'] = info['ext']
# Course page: collect every VideoPage link and recurse on each.
2806 elif mobj.group('course'): # A course page
2807 course = mobj.group('course')
2813 self.report_download_webpage(info['id'])
2815 coursepage = urllib2.urlopen(url).read()
2816 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2817 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2820 m = re.search('<h1>([^<]+)</h1>', coursepage)
2822 info['title'] = unescapeHTML(m.group(1))
2824 info['title'] = info['id']
2826 m = re.search('<description>([^<]+)</description>', coursepage)
2828 info['description'] = unescapeHTML(m.group(1))
2830 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2833 'type': 'reference',
2834 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2838 for entry in info['list']:
2839 assert entry['type'] == 'reference'
2840 results += self.extract(entry['url'])
# Root index: collect every CoursePage link and recurse on each.
2845 'id': 'Stanford OpenClassroom',
2849 self.report_download_webpage(info['id'])
2850 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2852 rootpage = urllib2.urlopen(rootURL).read()
2853 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2857 info['title'] = info['id']
2859 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2862 'type': 'reference',
2863 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2868 for entry in info['list']:
2869 assert entry['type'] == 'reference'
2870 results += self.extract(entry['url'])
2873 class MTVIE(InfoExtractor):
2874 """Information extractor for MTV.com"""
2876 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
def report_webpage(self, video_id):
    """Announce that the video's webpage is being downloaded."""
    self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

def report_extraction(self, video_id):
    """Announce that information extraction has begun."""
    self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# MTVIE._real_extract: scrape song/performer/uri meta tags, then resolve the
# media via the mediaGen XML and take the last (highest-quality) rendition.
# NOTE(review): listing is non-contiguous — try:/return/if-None lines are
# missing between the numbered rows.
2887 def _real_extract(self, url):
2888 mobj = re.match(self._VALID_URL, url)
2890 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2892 if not mobj.group('proto'):
2893 url = 'http://' + url
2894 video_id = mobj.group('videoid')
2895 self.report_webpage(video_id)
2897 request = urllib2.Request(url)
2899 webpage = urllib2.urlopen(request).read()
2900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2901 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song, performer and the mtvn URI all live in <meta> tags on the page.
2904 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2906 self._downloader.trouble(u'ERROR: unable to extract song name')
2908 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2909 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2911 self._downloader.trouble(u'ERROR: unable to extract performer')
2913 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2914 video_title = performer + ' - ' + song_name
2916 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2918 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2920 mtvn_uri = mobj.group(1)
2922 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2924 self._downloader.trouble(u'ERROR: unable to extract content id')
2926 content_id = mobj.group(1)
2928 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2929 self.report_extraction(video_id)
2930 request = urllib2.Request(videogen_url)
2932 metadataXml = urllib2.urlopen(request).read()
2933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2934 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2937 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2938 renditions = mdoc.findall('.//rendition')
2940 # For now, always pick the highest quality.
2941 rendition = renditions[-1]
# type attribute is a mime type like video/mp4; the subtype is the extension.
2944 _,_,ext = rendition.attrib['type'].partition('/')
2945 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2946 video_url = rendition.find('./src').text
2948 self._downloader.trouble('Invalid rendition field.')
2954 'uploader': performer,
2955 'title': video_title,
2963 class YoukuIE(InfoExtractor):
2965 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
def __init__(self, downloader=None):
    """Construct the extractor via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, file_id):
    """Announce that the Youku metadata page is being downloaded."""
    self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

def report_extraction(self, file_id):
    """Announce that information extraction has begun."""
    self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# YoukuIE id-scrambling helpers.  NOTE(review): this listing is
# non-contiguous — the def line of _gen_sid and several body lines
# (e.g. the mixed/realId list initializations) are missing between rows.
# _gen_sid body: session id built from a millisecond timestamp plus two
# random components.
2980 nowTime = int(time.time() * 1000)
2981 random1 = random.randint(1000,1998)
2982 random2 = random.randint(1000,9999)
2984 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic pseudo-random shuffle of the character alphabet, seeded by
# the server-provided 'seed' value (linear congruential step mod 65536).
2986 def _get_file_ID_mix_string(self, seed):
2988 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2990 for i in range(len(source)):
2991 seed = (seed * 211 + 30031 ) % 65536
2992 index = math.floor(seed / 65536 * len(source) )
2993 mixed.append(source[int(index)])
2994 source.remove(source[int(index)])
2995 #return ''.join(mixed)
# Decode the '*'-separated fileId indices through the shuffled alphabet.
2998 def _get_file_id(self, fileId, seed):
2999 mixed = self._get_file_ID_mix_string(seed)
3000 ids = fileId.split('*')
3004 realId.append(mixed[int(ch)])
3005 return ''.join(realId)
# YoukuIE._real_extract: query the getPlayList API, decode the scrambled
# file id, and emit one info dict per video segment.
# NOTE(review): listing is non-contiguous — try:/else:/format-selection
# branches and the final return are missing between the numbered rows.
3007 def _real_extract(self, url):
3008 mobj = re.match(self._VALID_URL, url)
3010 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3012 video_id = mobj.group('ID')
3014 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3016 request = urllib2.Request(info_url, None, std_headers)
3018 self.report_download_webpage(video_id)
3019 jsondata = urllib2.urlopen(request).read()
3020 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3021 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3024 self.report_extraction(video_id)
3026 config = json.loads(jsondata)
3028 video_title = config['data'][0]['title']
3029 seed = config['data'][0]['seed']
# Map the requested format onto one of the stream ids the API advertises.
3031 format = self._downloader.params.get('format', None)
3032 supported_format = config['data'][0]['streamfileids'].keys()
3034 if format is None or format == 'best':
3035 if 'hd2' in supported_format:
3040 elif format == 'worst':
3048 fileid = config['data'][0]['streamfileids'][format]
3049 seg_number = len(config['data'][0]['segs'][format])
3052 for i in xrange(seg_number):
3053 keys.append(config['data'][0]['segs'][format][i]['k'])
3056 #youku only could be viewed from mainland china
3058 self._downloader.trouble(u'ERROR: unable to extract info section')
3062 sid = self._gen_sid()
3063 fileid = self._get_file_id(fileid, seed)
3065 #column 8,9 of fileid represent the segment number
3066 #fileid[7:9] should be changed
# Build one download URL per segment, patching the segment index into the id.
3067 for index, key in enumerate(keys):
3069 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3070 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3073 'id': '%s_part%02d' % (video_id, index),
3074 'url': download_url,
3076 'title': video_title,
3080 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # Patterns matched against the raw page HTML in _real_extract:
    VIDEO_URL_RE = r'flv_url=(.*?)&'  # percent-encoded flv URL parameter
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'  # title from the <title> tag
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'  # thumbnail URL parameter
def report_webpage(self, video_id):
    """Report that the page for *video_id* is being downloaded."""
    label = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(label)
def report_extraction(self, video_id):
    """Report that information is being extracted for *video_id*."""
    label = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(label)
3102 def _real_extract(self, url):
3103 mobj = re.match(self._VALID_URL, url)
3105 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3107 video_id = mobj.group(1).decode('utf-8')
3109 self.report_webpage(video_id)
3111 # Get webpage content
3113 webpage = urllib2.urlopen(url).read()
3114 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3115 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3118 result = re.search(self.VIDEO_URL_RE, webpage)
3120 self._downloader.trouble(u'ERROR: unable to extract video url')
3122 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3124 result = re.search(self.VIDEO_TITLE_RE, webpage)
3126 self._downloader.trouble(u'ERROR: unable to extract video title')
3128 video_title = result.group(1).decode('utf-8')
3130 result = re.search(self.VIDEO_THUMB_RE, webpage)
3132 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3134 video_thumbnail = result.group(1).decode('utf-8')
3136 info = {'id': video_id,
3139 'upload_date': None,
3140 'title': video_title,
3143 'thumbnail': video_thumbnail,
3144 'description': None,