2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
43 uploader: Nickname of the video uploader, unescaped.
44 upload_date: Video upload date (YYYYMMDD).
45 title: Video title, unescaped.
46 ext: Video filename extension.
48 The following fields are optional:
50 format: The video format, defaults to ext (used for --get-format)
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The .srt file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib2.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# NOTE(review): this is a line-numbered dump with interior lines elided (the
# embedded numbers jump, e.g. 68 -> 75), so class attributes such as _WORKING
# itself are not visible here -- TODO recover the full source before editing.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
80 def suitable(self, url):
81 """Receives a URL and returns True if suitable for this IE."""
82 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def` headers for the two methods documented below
# (presumably working() and initialize(), orig. lines 84 and 88) are on
# elided lines; only their docstrings/bodies survived the dump.
85 """Getter method for _WORKING."""
89 """Initializes an instance (authentication, etc)."""
91 self._real_initialize()
94 def extract(self, url):
95 """Extracts URL information and returns it in list of dicts."""
97 return self._real_extract(url)
99 def set_downloader(self, downloader):
100 """Sets the downloader for this IE."""
101 self._downloader = downloader
103 def _real_initialize(self):
104 """Real initialization process. Redefine in subclasses."""
107 def _real_extract(self, url):
108 """Real extraction process. Redefine in subclasses."""
112 class YoutubeIE(InfoExtractor):
113 """Information extractor for youtube.com."""
# NOTE(review): the `_VALID_URL = r'''` opener for the verbose regex below is
# on an elided line (orig. 114-116); the fragment that follows is the regex
# body only. Group 1 is the "things that precede the ID" alternation; group 2
# is the 11-character video ID (see mobj.group(2) in _real_extract).
117 (?:https?://)? # http(s):// (optional)
118 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
119 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
120 (?:.*?\#/)? # handle anchor (#/) redirect urls
121 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
122 (?: # the various things that can precede the ID:
123 (?:(?:v|embed|e)/) # v/ or embed/ or e/
124 |(?: # or the v= param in all its forms
125 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
126 (?:\?|\#!?) # the params delimiter ? or # or #!
127 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
130 )? # optional -> youtube.com/xxxx is OK
131 )? # all until now is optional -> you can pass the naked ID
132 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
133 (?(1).+)? # if we found the ID, everything can follow
135 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
136 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
137 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
138 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
139 _NETRC_MACHINE = 'youtube'
140 # Listed in order of quality
141 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
142 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; most entries are on elided lines (orig. 144-148).
143 _video_extensions = {
149 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-size map; all entries are on elided lines (orig. 156+).
155 _video_dimensions = {
173 def suitable(self, url):
174 """Receives a URL and returns True if suitable for this IE."""
175 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
177 def report_lang(self):
178 """Report attempt to set language."""
179 self._downloader.to_screen(u'[youtube] Setting language')
181 def report_login(self):
182 """Report attempt to log in."""
183 self._downloader.to_screen(u'[youtube] Logging in')
185 def report_age_confirmation(self):
186 """Report attempt to confirm age."""
187 self._downloader.to_screen(u'[youtube] Confirming age')
189 def report_video_webpage_download(self, video_id):
190 """Report attempt to download video webpage."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
193 def report_video_info_webpage_download(self, video_id):
194 """Report attempt to download video info webpage."""
195 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
197 def report_video_subtitles_download(self, video_id):
198 """Report attempt to download video info webpage."""
199 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
201 def report_information_extraction(self, video_id):
202 """Report attempt to extract video information."""
203 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
205 def report_unavailable_format(self, video_id, format):
206 """Report extracted video URL."""
207 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
209 def report_rtmp_download(self):
210 """Indicate the download will use the RTMP protocol."""
211 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML to SRT text by regex-scraping the
# <text start="..." dur="...">caption</text> nodes.
# NOTE(review): the `srt` accumulator initialisation and a `start = float(start)`
# conversion appear to be on elided lines (orig. ~214 and 219) -- without them
# the arithmetic on `start` below would fail; TODO confirm against full source.
213 def _closed_captions_xml_to_srt(self, xml_string):
215 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
216 # TODO parse xml instead of regex
217 for n, (start, dur_tag, dur, caption) in enumerate(texts):
218 if not dur: dur = '4'
220 end = start + float(dur)
221 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
222 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
223 caption = unescapeHTML(caption)
224 caption = unescapeHTML(caption) # double cycle, intentional
225 srt += str(n+1) + '\n'
226 srt += start + ' --> ' + end + '\n'
227 srt += caption + '\n\n'
# Prints each format's itag, extension and display size (used by --list-formats).
230 def _print_formats(self, formats):
231 print('Available formats:')
233 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Authentication flow: pick credentials from --username/--password or .netrc,
# force the English site via _LANG_URL, log in, then confirm age.
# NOTE(review): Python 2 `except (...), err:` syntax throughout; `u(err)`
# is presumably a unicode-coercion helper defined elsewhere -- verify.
235 def _real_initialize(self):
236 if self._downloader is None:
241 downloader_params = self._downloader.params
243 # Attempt to use provided username and password or .netrc data
244 if downloader_params.get('username', None) is not None:
245 username = downloader_params['username']
246 password = downloader_params['password']
247 elif downloader_params.get('usenetrc', False):
249 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
254 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
255 except (IOError, netrc.NetrcParseError), err:
256 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
260 request = urllib2.Request(self._LANG_URL)
263 urllib2.urlopen(request).read()
264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
265 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % u(err))
268 # No authentication to be performed
274 'current_form': 'loginForm',
276 'action_login': 'Log In',
277 'username': username,
278 'password': password,
280 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
283 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
284 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
285 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
288 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
294 'action_confirm': 'Confirm',
296 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
298 self.report_age_confirmation()
299 age_results = urllib2.urlopen(request).read()
300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
301 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
# Main extraction: resolve next_url redirects, fetch the watch page and the
# get_video_info endpoint, then build one info dict per requested format.
304 def _real_extract(self, url):
305 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
306 mobj = re.search(self._NEXT_URL_RE, url)
308 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
310 # Extract video id from URL
311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
313 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
315 video_id = mobj.group(2)
318 self.report_video_webpage_download(video_id)
319 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
321 video_webpage = urllib2.urlopen(request).read()
322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
323 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
326 # Attempt to extract SWF player URL
327 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
329 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several &el= variants; stop at the first response containing a token.
334 self.report_video_info_webpage_download(video_id)
335 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
336 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
337 % (video_id, el_type))
338 request = urllib2.Request(video_info_url)
340 video_info_webpage = urllib2.urlopen(request).read()
341 video_info = parse_qs(video_info_webpage)
342 if 'token' in video_info:
344 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
345 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
347 if 'token' not in video_info:
348 if 'reason' in video_info:
349 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
351 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
354 # Check for "rental" videos
355 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
356 self._downloader.trouble(u'ERROR: "rental" videos not supported')
359 # Start extracting information
360 self.report_information_extraction(video_id)
363 if 'author' not in video_info:
364 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
366 video_uploader = urllib.unquote_plus(video_info['author'][0])
369 if 'title' not in video_info:
370 self._downloader.trouble(u'ERROR: unable to extract video title')
372 video_title = urllib.unquote_plus(video_info['title'][0])
373 video_title = video_title.decode('utf-8')
376 if 'thumbnail_url' not in video_info:
377 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
379 else: # don't panic if we can't find it
380 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the page and tried against several textual
# date layouts before normalising to YYYYMMDD.
384 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
386 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
387 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
388 for expression in format_expressions:
390 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
395 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
396 if video_description: video_description = clean_html(video_description)
397 else: video_description = ''
# Optional subtitles: list available caption tracks, pick --sub-lang, 'en',
# or the first listed language, then convert the XML track to SRT.
# Trouble(...) is raised for soft failures and reported in the handler below.
400 video_subtitles = None
401 if self._downloader.params.get('writesubtitles', False):
403 self.report_video_subtitles_download(video_id)
404 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
406 srt_list = urllib2.urlopen(request).read()
407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
408 raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
409 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
410 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
411 if not srt_lang_list:
412 raise Trouble(u'WARNING: video has no closed captions')
413 if self._downloader.params.get('subtitleslang', False):
414 srt_lang = self._downloader.params.get('subtitleslang')
415 elif 'en' in srt_lang_list:
418 srt_lang = srt_lang_list.keys()[0]
419 if not srt_lang in srt_lang_list:
420 raise Trouble(u'WARNING: no closed captions found in the specified language')
421 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
423 srt_xml = urllib2.urlopen(request).read()
424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
425 raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
427 raise Trouble(u'WARNING: unable to download video subtitles')
428 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
429 except Trouble as trouble:
430 self._downloader.trouble(trouble[0])
432 if 'length_seconds' not in video_info:
433 self._downloader.trouble(u'WARNING: unable to extract video duration')
436 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
439 video_token = urllib.unquote_plus(video_info['token'][0])
441 # Decide which formats to download
442 req_format = self._downloader.params.get('format', None)
444 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
445 self.report_rtmp_download()
446 video_url_list = [(None, video_info['conn'][0])]
447 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
448 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
449 url_data = [parse_qs(uds) for uds in url_data_strs]
450 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
# NOTE(review): ud['sig'] is indexed without the 'itag'/'url' presence check
# above guarding it -- streams lacking a 'sig' field would raise KeyError
# here; verify against the full source whether that is handled elsewhere.
451 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
453 format_limit = self._downloader.params.get('format_limit', None)
454 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
455 if format_limit is not None and format_limit in available_formats:
456 format_list = available_formats[available_formats.index(format_limit):]
458 format_list = available_formats
459 existing_formats = [x for x in format_list if x in url_map]
460 if len(existing_formats) == 0:
461 self._downloader.trouble(u'ERROR: no known formats available for video')
463 if self._downloader.params.get('listformats', None):
464 self._print_formats(existing_formats)
466 if req_format is None or req_format == 'best':
467 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
468 elif req_format == 'worst':
469 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
470 elif req_format in ('-1', 'all'):
471 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
473 # Specific formats. We pick the first in a slash-delimeted sequence.
474 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
475 req_formats = req_format.split('/')
476 video_url_list = None
477 for rf in req_formats:
479 video_url_list = [(rf, url_map[rf])]
481 if video_url_list is None:
482 self._downloader.trouble(u'ERROR: requested format not available')
485 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format (results list initialisation is
# on an elided line, orig. ~488).
489 for format_param, video_real_url in video_url_list:
491 video_extension = self._video_extensions.get(format_param, 'flv')
493 video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
494 self._video_dimensions.get(format_param, '???'))
497 'id': video_id.decode('utf-8'),
498 'url': video_real_url.decode('utf-8'),
499 'uploader': video_uploader.decode('utf-8'),
500 'upload_date': upload_date,
501 'title': video_title,
502 'ext': video_extension.decode('utf-8'),
503 'format': video_format,
504 'thumbnail': video_thumbnail.decode('utf-8'),
505 'description': video_description,
506 'player_url': player_url,
507 'subtitles': video_subtitles,
508 'duration': video_duration
513 class MetacafeIE(InfoExtractor):
514 """Information Extractor for metacafe.com."""
516 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
517 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
518 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
519 IE_NAME = u'metacafe'
521 def __init__(self, downloader=None):
522 InfoExtractor.__init__(self, downloader)
524 def report_disclaimer(self):
525 """Report disclaimer retrieval."""
526 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
528 def report_age_confirmation(self):
529 """Report attempt to confirm age."""
530 self._downloader.to_screen(u'[metacafe] Confirming age')
532 def report_download_webpage(self, video_id):
533 """Report webpage download."""
534 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
536 def report_extraction(self, video_id):
537 """Report information extraction."""
538 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the age confirmation so
# filtered videos become reachable. NOTE(review): this dump elides interior
# lines (`try:` openers, the disclaimer_form literal opener, orig. 543/548-552),
# and uses Python 2 `except (...), err:` syntax.
540 def _real_initialize(self):
541 # Retrieve disclaimer
542 request = urllib2.Request(self._DISCLAIMER)
544 self.report_disclaimer()
545 disclaimer = urllib2.urlopen(request).read()
546 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
547 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % u(err))
553 'submit': "Continue - I'm over 18",
555 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
557 self.report_age_confirmation()
558 disclaimer = urllib2.urlopen(request).read()
559 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
560 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
563 def _real_extract(self, url):
564 # Extract id and simplified title from URL
565 mobj = re.match(self._VALID_URL, url)
567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
570 video_id = mobj.group(1)
572 # Check if video comes from YouTube
573 mobj2 = re.match(r'^yt-(.*)$', video_id)
574 if mobj2 is not None:
# Metacafe "yt-" IDs are YouTube mirrors: delegate to the downloader,
# which will dispatch the rewritten URL to YoutubeIE.
575 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
578 # Retrieve video webpage to extract further information
579 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
581 self.report_download_webpage(video_id)
582 webpage = urllib2.urlopen(request).read()
583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
587 # Extract URL, uploader and title from webpage
588 self.report_extraction(video_id)
589 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
591 mediaURL = urllib.unquote(mobj.group(1))
592 video_extension = mediaURL[-3:]
594 # Extract gdaKey if available
595 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
599 gdaKey = mobj.group(1)
600 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no &mediaURL= in the page, so parse the flashvars blob.
602 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 vardict = parse_qs(mobj.group(1))
607 if 'mediaData' not in vardict:
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
610 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
612 self._downloader.trouble(u'ERROR: unable to extract media URL')
614 mediaURL = mobj.group(1).replace('\\/', '/')
615 video_extension = mediaURL[-3:]
616 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
618 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
620 self._downloader.trouble(u'ERROR: unable to extract title')
622 video_title = mobj.group(1).decode('utf-8')
624 mobj = re.search(r'submitter=(.*?);', webpage)
626 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
628 video_uploader = mobj.group(1)
# Result dict (list-literal opener and closer on elided lines, orig. 630/637+).
631 'id': video_id.decode('utf-8'),
632 'url': video_url.decode('utf-8'),
633 'uploader': video_uploader.decode('utf-8'),
635 'title': video_title,
636 'ext': video_extension.decode('utf-8'),
640 class DailymotionIE(InfoExtractor):
641 """Information Extractor for Dailymotion"""
643 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
644 IE_NAME = u'dailymotion'
646 def __init__(self, downloader=None):
647 InfoExtractor.__init__(self, downloader)
649 def report_download_webpage(self, video_id):
650 """Report webpage download."""
651 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
653 def report_extraction(self, video_id):
654 """Report information extraction."""
655 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# NOTE(review): interior lines are elided from this dump (the `if mobj is
# None:` guards before each trouble() call, `try:` openers, the max_quality
# selection loop body, and the final results list) -- Python 2 syntax.
657 def _real_extract(self, url):
658 # Extract id and simplified title from URL
659 mobj = re.match(self._VALID_URL, url)
661 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip title suffix and query string: /video/x1abc_some-title?foo -> x1abc
664 video_id = mobj.group(1).split('_')[0].split('?')[0]
666 video_extension = 'mp4'
668 # Retrieve video webpage to extract further information
669 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages still render.
670 request.add_header('Cookie', 'family_filter=off')
672 self.report_download_webpage(video_id)
673 webpage = urllib2.urlopen(request).read()
674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
675 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
678 # Extract URL, uploader and title from webpage
679 self.report_extraction(video_id)
680 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
682 self._downloader.trouble(u'ERROR: unable to extract media URL')
684 flashvars = urllib.unquote(mobj.group(1))
# Scan qualities from best to worst; the first key present in flashvars is
# chosen (assignment to max_quality is presumably on an elided line).
686 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
689 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
692 self._downloader.trouble(u'ERROR: unable to extract video URL')
695 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
697 self._downloader.trouble(u'ERROR: unable to extract video URL')
700 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
702 # TODO: support choosing qualities
704 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
706 self._downloader.trouble(u'ERROR: unable to extract title')
708 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
710 video_uploader = None
711 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
713 # lookin for official user
714 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
715 if mobj_official is None:
716 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
718 video_uploader = mobj_official.group(1)
720 video_uploader = mobj.group(1)
722 video_upload_date = None
# Page shows DD-MM-YYYY; reorder the groups into the YYYYMMDD convention.
723 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
725 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
728 'id': video_id.decode('utf-8'),
729 'url': video_url.decode('utf-8'),
730 'uploader': video_uploader.decode('utf-8'),
731 'upload_date': video_upload_date,
732 'title': video_title,
733 'ext': video_extension.decode('utf-8'),
737 class GoogleIE(InfoExtractor):
738 """Information extractor for video.google.com."""
740 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
741 IE_NAME = u'video.google'
743 def __init__(self, downloader=None):
744 InfoExtractor.__init__(self, downloader)
746 def report_download_webpage(self, video_id):
747 """Report webpage download."""
748 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
750 def report_extraction(self, video_id):
751 """Report information extraction."""
752 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (guards, `try:` openers,
# mediaURL handling between orig. 785 and 789, the results list) -- Py2 syntax.
754 def _real_extract(self, url):
755 # Extract id from URL
756 mobj = re.match(self._VALID_URL, url)
758 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
761 video_id = mobj.group(1)
763 video_extension = 'mp4'
765 # Retrieve video webpage to extract further information
766 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
768 self.report_download_webpage(video_id)
769 webpage = urllib2.urlopen(request).read()
770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
774 # Extract URL, uploader, and title from webpage
775 self.report_extraction(video_id)
# Prefer the direct mp4 download_url; fall back to the escaped flv videoUrl.
776 mobj = re.search(r"download_url:'([^']+)'", webpage)
778 video_extension = 'flv'
779 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
781 self._downloader.trouble(u'ERROR: unable to extract media URL')
783 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS-style \xNN escaping of '=' and '&' in the URL.
784 mediaURL = mediaURL.replace('\\x3d', '\x3d')
785 mediaURL = mediaURL.replace('\\x26', '\x26')
789 mobj = re.search(r'<title>(.*)</title>', webpage)
791 self._downloader.trouble(u'ERROR: unable to extract title')
793 video_title = mobj.group(1).decode('utf-8')
795 # Extract video description
796 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
798 self._downloader.trouble(u'ERROR: unable to extract video description')
800 video_description = mobj.group(1).decode('utf-8')
801 if not video_description:
802 video_description = 'No description available.'
804 # Extract video thumbnail
805 if self._downloader.params.get('forcethumbnail', False):
# Thumbnail only lives on the search-results page, so fetch that separately.
806 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
808 webpage = urllib2.urlopen(request).read()
809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
812 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
814 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
816 video_thumbnail = mobj.group(1)
817 else: # we need something to pass to process_info
821 'id': video_id.decode('utf-8'),
822 'url': video_url.decode('utf-8'),
825 'title': video_title,
826 'ext': video_extension.decode('utf-8'),
830 class PhotobucketIE(InfoExtractor):
831 """Information extractor for photobucket.com."""
# Only .flv URLs reachable through the ?current= parameter are supported.
833 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
834 IE_NAME = u'photobucket'
836 def __init__(self, downloader=None):
837 InfoExtractor.__init__(self, downloader)
839 def report_download_webpage(self, video_id):
840 """Report webpage download."""
841 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
843 def report_extraction(self, video_id):
844 """Report information extraction."""
845 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (`if mobj is None:`
# guards, `try:` openers, video_url assignment near orig. 874, results list).
847 def _real_extract(self, url):
848 # Extract id from URL
849 mobj = re.match(self._VALID_URL, url)
851 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
854 video_id = mobj.group(1)
856 video_extension = 'flv'
858 # Retrieve video webpage to extract further information
859 request = urllib2.Request(url)
861 self.report_download_webpage(video_id)
862 webpage = urllib2.urlopen(request).read()
863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
864 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
867 # Extract URL, uploader, and title from webpage
868 self.report_extraction(video_id)
869 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
871 self._downloader.trouble(u'ERROR: unable to extract media URL')
873 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader come from the same <title> pattern (groups 1 and 2).
877 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
879 self._downloader.trouble(u'ERROR: unable to extract title')
881 video_title = mobj.group(1).decode('utf-8')
883 video_uploader = mobj.group(2).decode('utf-8')
886 'id': video_id.decode('utf-8'),
887 'url': video_url.decode('utf-8'),
888 'uploader': video_uploader,
890 'title': video_title,
891 'ext': video_extension.decode('utf-8'),
895 class YahooIE(InfoExtractor):
896 """Information extractor for video.yahoo.com."""
898 # _VALID_URL matches all Yahoo! Video URLs
899 # _VPAGE_URL matches only the extractable '/watch/' URLs
900 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
901 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
902 IE_NAME = u'video.yahoo'
904 def __init__(self, downloader=None):
905 InfoExtractor.__init__(self, downloader)
907 def report_download_webpage(self, video_id):
908 """Report webpage download."""
909 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
911 def report_extraction(self, video_id):
912 """Report information extraction."""
913 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (guards, `try:` openers,
# the final results list). `new_video` guards the single self-recursion below:
# non-/watch/ URLs are rewritten once, then re-extracted with new_video=False.
915 def _real_extract(self, url, new_video=True):
916 # Extract ID from URL
917 mobj = re.match(self._VALID_URL, url)
919 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
922 video_id = mobj.group(2)
923 video_extension = 'flv'
925 # Rewrite valid but non-extractable URLs as
926 # extractable English language /watch/ URLs
927 if re.match(self._VPAGE_URL, url) is None:
928 request = urllib2.Request(url)
930 webpage = urllib2.urlopen(request).read()
931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
935 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
937 self._downloader.trouble(u'ERROR: Unable to extract id field')
939 yahoo_id = mobj.group(1)
941 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
943 self._downloader.trouble(u'ERROR: Unable to extract vid field')
945 yahoo_vid = mobj.group(1)
947 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
948 return self._real_extract(url, new_video=False)
950 # Retrieve video webpage to extract further information
951 request = urllib2.Request(url)
953 self.report_download_webpage(video_id)
954 webpage = urllib2.urlopen(request).read()
955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
959 # Extract uploader and title from webpage
960 self.report_extraction(video_id)
961 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
963 self._downloader.trouble(u'ERROR: unable to extract video title')
965 video_title = mobj.group(1).decode('utf-8')
# NOTE(review): group(1) here is the (people|profile) alternation, not the
# uploader name in group(2) -- looks like a latent bug, but confirm against
# the full source before changing.
967 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
969 self._downloader.trouble(u'ERROR: unable to extract video uploader')
971 video_uploader = mobj.group(1).decode('utf-8')
973 # Extract video thumbnail
974 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
976 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
978 video_thumbnail = mobj.group(1).decode('utf-8')
980 # Extract video description
981 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video description')
985 video_description = mobj.group(1).decode('utf-8')
986 if not video_description:
987 video_description = 'No description available.'
989 # Extract video height and width
990 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
992 self._downloader.trouble(u'ERROR: unable to extract video height')
994 yv_video_height = mobj.group(1)
996 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
998 self._downloader.trouble(u'ERROR: unable to extract video width')
1000 yv_video_width = mobj.group(1)
1002 # Retrieve video playlist to extract media URL
1003 # I'm not completely sure what all these options are, but we
1004 # seem to need most of them, otherwise the server sends a 401.
1005 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1006 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1007 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1008 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1009 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1011 self.report_download_webpage(video_id)
1012 webpage = urllib2.urlopen(request).read()
1013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1014 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1017 # Extract media URL from playlist XML
1018 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1020 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1022 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1023 video_url = unescapeHTML(video_url)
1026 'id': video_id.decode('utf-8'),
1028 'uploader': video_uploader,
1029 'upload_date': None,
1030 'title': video_title,
1031 'ext': video_extension.decode('utf-8'),
1032 'thumbnail': video_thumbnail.decode('utf-8'),
1033 'description': video_description,
1037 class VimeoIE(InfoExtractor):
1038 """Information extractor for vimeo.com."""
1040 # _VALID_URL matches Vimeo URLs
1041 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1044 def __init__(self, downloader=None):
1045 InfoExtractor.__init__(self, downloader)
1047 def report_download_webpage(self, video_id):
1048 """Report webpage download."""
1049 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1051 def report_extraction(self, video_id):
1052 """Report information extraction."""
1053 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1055 def _real_extract(self, url, new_video=True):
1056 # Extract ID from URL
1057 mobj = re.match(self._VALID_URL, url)
1059 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1062 video_id = mobj.group(1)
1064 # Retrieve video webpage to extract further information
1065 request = urllib2.Request(url, None, std_headers)
1067 self.report_download_webpage(video_id)
1068 webpage = urllib2.urlopen(request).read()
1069 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1070 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1073 # Now we begin extracting as much information as we can from what we
1074 # retrieved. First we extract the information common to all extractors,
1075 # and latter we extract those that are Vimeo specific.
1076 self.report_extraction(video_id)
1078 # Extract the config JSON
1079 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081 config = json.loads(config)
1083 self._downloader.trouble(u'ERROR: unable to extract info section')
1087 video_title = config["video"]["title"]
1090 video_uploader = config["video"]["owner"]["name"]
1092 # Extract video thumbnail
1093 video_thumbnail = config["video"]["thumbnail"]
1095 # Extract video description
1096 video_description = get_element_by_id("description", webpage.decode('utf8'))
1097 if video_description: video_description = clean_html(video_description)
1098 else: video_description = ''
1100 # Extract upload date
1101 video_upload_date = None
1102 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1103 if mobj is not None:
1104 video_upload_date = mobj.group(1)
1106 # Vimeo specific: extract request signature and timestamp
1107 sig = config['request']['signature']
1108 timestamp = config['request']['timestamp']
1110 # Vimeo specific: extract video codec and quality information
1111 # First consider quality, then codecs, then take everything
1112 # TODO bind to format param
1113 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1114 files = { 'hd': [], 'sd': [], 'other': []}
1115 for codec_name, codec_extension in codecs:
1116 if codec_name in config["video"]["files"]:
1117 if 'hd' in config["video"]["files"][codec_name]:
1118 files['hd'].append((codec_name, codec_extension, 'hd'))
1119 elif 'sd' in config["video"]["files"][codec_name]:
1120 files['sd'].append((codec_name, codec_extension, 'sd'))
1122 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1124 for quality in ('hd', 'sd', 'other'):
1125 if len(files[quality]) > 0:
1126 video_quality = files[quality][0][2]
1127 video_codec = files[quality][0][0]
1128 video_extension = files[quality][0][1]
1129 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1132 self._downloader.trouble(u'ERROR: no known codec found')
1135 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1136 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1141 'uploader': video_uploader,
1142 'upload_date': video_upload_date,
1143 'title': video_title,
1144 'ext': video_extension,
1145 'thumbnail': video_thumbnail,
1146 'description': video_description,
1150 class ArteTvIE(InfoExtractor):
1151 """arte.tv information extractor."""
1153 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1154 _LIVE_URL = r'index-[0-9]+\.html$'
1156 IE_NAME = u'arte.tv'
1158 def __init__(self, downloader=None):
1159 InfoExtractor.__init__(self, downloader)
1161 def report_download_webpage(self, video_id):
1162 """Report webpage download."""
1163 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1165 def report_extraction(self, video_id):
1166 """Report information extraction."""
1167 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1169 def fetch_webpage(self, url):
1170 self._downloader.increment_downloads()
1171 request = urllib2.Request(url)
1173 self.report_download_webpage(url)
1174 webpage = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1178 except ValueError, err:
1179 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1184 page = self.fetch_webpage(url)
1185 mobj = re.search(regex, page, regexFlags)
1189 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1192 for (i, key, err) in matchTuples:
1193 if mobj.group(i) is None:
1194 self._downloader.trouble(err)
1197 info[key] = mobj.group(i)
1201 def extractLiveStream(self, url):
1202 video_lang = url.split('/')[-4]
1203 info = self.grep_webpage(
1205 r'src="(.*?/videothek_js.*?\.js)',
1208 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1211 http_host = url.split('/')[2]
1212 next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
1213 info = self.grep_webpage(
1215 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1216 '(http://.*?\.swf).*?' +
1220 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1221 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1222 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1225 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1227 def extractPlus7Stream(self, url):
1228 video_lang = url.split('/')[-3]
1229 info = self.grep_webpage(
1231 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1234 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1237 next_url = urllib.unquote(info.get('url'))
1238 info = self.grep_webpage(
1240 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1243 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1246 next_url = urllib.unquote(info.get('url'))
1248 info = self.grep_webpage(
1250 r'<video id="(.*?)".*?>.*?' +
1251 '<name>(.*?)</name>.*?' +
1252 '<dateVideo>(.*?)</dateVideo>.*?' +
1253 '<url quality="hd">(.*?)</url>',
1256 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1257 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1258 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1259 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1264 'id': info.get('id'),
1265 'url': urllib.unquote(info.get('url')),
1266 'uploader': u'arte.tv',
1267 'upload_date': info.get('date'),
1268 'title': info.get('title'),
1274 def _real_extract(self, url):
1275 video_id = url.split('/')[-1]
1276 self.report_extraction(video_id)
1278 if re.search(self._LIVE_URL, video_id) is not None:
1279 self.extractLiveStream(url)
1282 info = self.extractPlus7Stream(url)
1287 class GenericIE(InfoExtractor):
1288 """Generic last-resort information extractor."""
1291 IE_NAME = u'generic'
1293 def __init__(self, downloader=None):
1294 InfoExtractor.__init__(self, downloader)
1296 def report_download_webpage(self, video_id):
1297 """Report webpage download."""
1298 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1299 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1301 def report_extraction(self, video_id):
1302 """Report information extraction."""
1303 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1305 def report_following_redirect(self, new_url):
1306 """Report information extraction."""
1307 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1309 def _test_redirect(self, url):
1310 """Check if it is a redirect, like url shorteners, in case restart chain."""
1311 class HeadRequest(urllib2.Request):
1312 def get_method(self):
1315 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1317 Subclass the HTTPRedirectHandler to make it use our
1318 HeadRequest also on the redirected URL
1320 def redirect_request(self, req, fp, code, msg, headers, newurl):
1321 if code in (301, 302, 303, 307):
1322 newurl = newurl.replace(' ', '%20')
1323 newheaders = dict((k,v) for k,v in req.headers.items()
1324 if k.lower() not in ("content-length", "content-type"))
1325 return HeadRequest(newurl,
1327 origin_req_host=req.get_origin_req_host(),
1330 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1332 class HTTPMethodFallback(urllib2.BaseHandler):
1334 Fallback to GET if HEAD is not allowed (405 HTTP error)
1336 def http_error_405(self, req, fp, code, msg, headers):
1340 newheaders = dict((k,v) for k,v in req.headers.items()
1341 if k.lower() not in ("content-length", "content-type"))
1342 return self.parent.open(urllib2.Request(req.get_full_url(),
1344 origin_req_host=req.get_origin_req_host(),
1348 opener = urllib2.OpenerDirector()
1349 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1350 HTTPMethodFallback, HEADRedirectHandler,
1351 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1352 opener.add_handler(handler())
1354 response = opener.open(HeadRequest(url))
1355 new_url = response.geturl()
1357 if url == new_url: return False
1359 self.report_following_redirect(new_url)
1360 self._downloader.download([new_url])
1363 def _real_extract(self, url):
1364 if self._test_redirect(url): return
1366 video_id = url.split('/')[-1]
1367 request = urllib2.Request(url)
1369 self.report_download_webpage(video_id)
1370 webpage = urllib2.urlopen(request).read()
1371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1374 except ValueError, err:
1375 # since this is the last-resort InfoExtractor, if
1376 # this error is thrown, it'll be thrown here
1377 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380 self.report_extraction(video_id)
1381 # Start with something easy: JW Player in SWFObject
1382 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1384 # Broaden the search a little bit
1385 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1387 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1390 # It's possible that one of the regexes
1391 # matched, but returned an empty group:
1392 if mobj.group(1) is None:
1393 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1396 video_url = urllib.unquote(mobj.group(1))
1397 video_id = os.path.basename(video_url)
1399 # here's a fun little line of code for you:
1400 video_extension = os.path.splitext(video_id)[1][1:]
1401 video_id = os.path.splitext(video_id)[0]
1403 # it's tempting to parse this further, but you would
1404 # have to take into account all the variations like
1405 # Video Title - Site Name
1406 # Site Name | Video Title
1407 # Video Title - Tagline | Site Name
1408 # and so on and so forth; it's just not practical
1409 mobj = re.search(r'<title>(.*)</title>', webpage)
1411 self._downloader.trouble(u'ERROR: unable to extract title')
1413 video_title = mobj.group(1).decode('utf-8')
1415 # video uploader is domain name
1416 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1418 self._downloader.trouble(u'ERROR: unable to extract title')
1420 video_uploader = mobj.group(1).decode('utf-8')
1423 'id': video_id.decode('utf-8'),
1424 'url': video_url.decode('utf-8'),
1425 'uploader': video_uploader,
1426 'upload_date': None,
1427 'title': video_title,
1428 'ext': video_extension.decode('utf-8'),
1432 class YoutubeSearchIE(InfoExtractor):
1433 """Information Extractor for YouTube search queries."""
1434 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1435 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1436 _max_youtube_results = 1000
1437 IE_NAME = u'youtube:search'
1439 def __init__(self, downloader=None):
1440 InfoExtractor.__init__(self, downloader)
1442 def report_download_page(self, query, pagenum):
1443 """Report attempt to download search page with given number."""
1444 query = query.decode(preferredencoding())
1445 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1447 def _real_extract(self, query):
1448 mobj = re.match(self._VALID_URL, query)
1450 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1453 prefix, query = query.split(':')
1455 query = query.encode('utf-8')
1457 self._download_n_results(query, 1)
1459 elif prefix == 'all':
1460 self._download_n_results(query, self._max_youtube_results)
1466 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1468 elif n > self._max_youtube_results:
1469 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1470 n = self._max_youtube_results
1471 self._download_n_results(query, n)
1473 except ValueError: # parsing prefix as integer fails
1474 self._download_n_results(query, 1)
1477 def _download_n_results(self, query, n):
1478 """Downloads a specified number of results for a query"""
1484 while (50 * pagenum) < limit:
1485 self.report_download_page(query, pagenum+1)
1486 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1487 request = urllib2.Request(result_url)
1489 data = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: unable to download API page: %s' % u(err))
1493 api_response = json.loads(data)['data']
1495 new_ids = list(video['id'] for video in api_response['items'])
1496 video_ids += new_ids
1498 limit = min(n, api_response['totalItems'])
1501 if len(video_ids) > n:
1502 video_ids = video_ids[:n]
1503 for id in video_ids:
1504 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1508 class GoogleSearchIE(InfoExtractor):
1509 """Information Extractor for Google Video search queries."""
1510 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1511 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1512 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1513 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1514 _max_google_results = 1000
1515 IE_NAME = u'video.google:search'
1517 def __init__(self, downloader=None):
1518 InfoExtractor.__init__(self, downloader)
1520 def report_download_page(self, query, pagenum):
1521 """Report attempt to download playlist page with given number."""
1522 query = query.decode(preferredencoding())
1523 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1525 def _real_extract(self, query):
1526 mobj = re.match(self._VALID_URL, query)
1528 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1531 prefix, query = query.split(':')
1533 query = query.encode('utf-8')
1535 self._download_n_results(query, 1)
1537 elif prefix == 'all':
1538 self._download_n_results(query, self._max_google_results)
1544 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1546 elif n > self._max_google_results:
1547 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1548 n = self._max_google_results
1549 self._download_n_results(query, n)
1551 except ValueError: # parsing prefix as integer fails
1552 self._download_n_results(query, 1)
1555 def _download_n_results(self, query, n):
1556 """Downloads a specified number of results for a query"""
1562 self.report_download_page(query, pagenum)
1563 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1564 request = urllib2.Request(result_url)
1566 page = urllib2.urlopen(request).read()
1567 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1568 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1571 # Extract video identifiers
1572 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1573 video_id = mobj.group(1)
1574 if video_id not in video_ids:
1575 video_ids.append(video_id)
1576 if len(video_ids) == n:
1577 # Specified n videos reached
1578 for id in video_ids:
1579 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1582 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1583 for id in video_ids:
1584 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1587 pagenum = pagenum + 1
1590 class YahooSearchIE(InfoExtractor):
1591 """Information Extractor for Yahoo! Video search queries."""
1592 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1593 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1594 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1595 _MORE_PAGES_INDICATOR = r'\s*Next'
1596 _max_yahoo_results = 1000
1597 IE_NAME = u'video.yahoo:search'
1599 def __init__(self, downloader=None):
1600 InfoExtractor.__init__(self, downloader)
1602 def report_download_page(self, query, pagenum):
1603 """Report attempt to download playlist page with given number."""
1604 query = query.decode(preferredencoding())
1605 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1607 def _real_extract(self, query):
1608 mobj = re.match(self._VALID_URL, query)
1610 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1613 prefix, query = query.split(':')
1615 query = query.encode('utf-8')
1617 self._download_n_results(query, 1)
1619 elif prefix == 'all':
1620 self._download_n_results(query, self._max_yahoo_results)
1626 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1628 elif n > self._max_yahoo_results:
1629 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1630 n = self._max_yahoo_results
1631 self._download_n_results(query, n)
1633 except ValueError: # parsing prefix as integer fails
1634 self._download_n_results(query, 1)
1637 def _download_n_results(self, query, n):
1638 """Downloads a specified number of results for a query"""
1641 already_seen = set()
1645 self.report_download_page(query, pagenum)
1646 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1647 request = urllib2.Request(result_url)
1649 page = urllib2.urlopen(request).read()
1650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1654 # Extract video identifiers
1655 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1656 video_id = mobj.group(1)
1657 if video_id not in already_seen:
1658 video_ids.append(video_id)
1659 already_seen.add(video_id)
1660 if len(video_ids) == n:
1661 # Specified n videos reached
1662 for id in video_ids:
1663 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1666 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1667 for id in video_ids:
1668 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1671 pagenum = pagenum + 1
1674 class YoutubePlaylistIE(InfoExtractor):
1675 """Information Extractor for YouTube playlists."""
1677 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1678 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1679 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1680 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1681 IE_NAME = u'youtube:playlist'
1683 def __init__(self, downloader=None):
1684 InfoExtractor.__init__(self, downloader)
1686 def report_download_page(self, playlist_id, pagenum):
1687 """Report attempt to download playlist page with given number."""
1688 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1690 def _real_extract(self, url):
1691 # Extract playlist id
1692 mobj = re.match(self._VALID_URL, url)
1694 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1698 if mobj.group(3) is not None:
1699 self._downloader.download([mobj.group(3)])
1702 # Download playlist pages
1703 # prefix is 'p' as default for playlists but there are other types that need extra care
1704 playlist_prefix = mobj.group(1)
1705 if playlist_prefix == 'a':
1706 playlist_access = 'artist'
1708 playlist_prefix = 'p'
1709 playlist_access = 'view_play_list'
1710 playlist_id = mobj.group(2)
1715 self.report_download_page(playlist_id, pagenum)
1716 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1717 request = urllib2.Request(url)
1719 page = urllib2.urlopen(request).read()
1720 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1721 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1724 # Extract video identifiers
1726 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1727 if mobj.group(1) not in ids_in_page:
1728 ids_in_page.append(mobj.group(1))
1729 video_ids.extend(ids_in_page)
1731 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1733 pagenum = pagenum + 1
1735 playliststart = self._downloader.params.get('playliststart', 1) - 1
1736 playlistend = self._downloader.params.get('playlistend', -1)
1737 if playlistend == -1:
1738 video_ids = video_ids[playliststart:]
1740 video_ids = video_ids[playliststart:playlistend]
1742 for id in video_ids:
1743 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1747 class YoutubeChannelIE(InfoExtractor):
1748 """Information Extractor for YouTube channels."""
1750 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1751 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1752 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1753 IE_NAME = u'youtube:channel'
1755 def report_download_page(self, channel_id, pagenum):
1756 """Report attempt to download channel page with given number."""
1757 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1759 def _real_extract(self, url):
1760 # Extract channel id
1761 mobj = re.match(self._VALID_URL, url)
1763 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1766 # Download channel pages
1767 channel_id = mobj.group(1)
1772 self.report_download_page(channel_id, pagenum)
1773 url = self._TEMPLATE_URL % (channel_id, pagenum)
1774 request = urllib2.Request(url)
1776 page = urllib2.urlopen(request).read()
1777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1781 # Extract video identifiers
1783 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1784 if mobj.group(1) not in ids_in_page:
1785 ids_in_page.append(mobj.group(1))
1786 video_ids.extend(ids_in_page)
1788 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1790 pagenum = pagenum + 1
1792 for id in video_ids:
1793 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1797 class YoutubeUserIE(InfoExtractor):
1798 """Information Extractor for YouTube users."""
1800 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1801 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1802 _GDATA_PAGE_SIZE = 50
1803 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1804 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1805 IE_NAME = u'youtube:user'
1807 def __init__(self, downloader=None):
1808 InfoExtractor.__init__(self, downloader)
1810 def report_download_page(self, username, start_index):
1811 """Report attempt to download user page."""
1812 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1813 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1815 def _real_extract(self, url):
1817 mobj = re.match(self._VALID_URL, url)
1819 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1822 username = mobj.group(1)
1824 # Download video ids using YouTube Data API. Result size per
1825 # query is limited (currently to 50 videos) so we need to query
1826 # page by page until there are no video ids - it means we got
1833 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1834 self.report_download_page(username, start_index)
1836 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1839 page = urllib2.urlopen(request).read()
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1844 # Extract video identifiers
1847 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1848 if mobj.group(1) not in ids_in_page:
1849 ids_in_page.append(mobj.group(1))
1851 video_ids.extend(ids_in_page)
1853 # A little optimization - if current page is not
1854 # "full", ie. does not contain PAGE_SIZE video ids then
1855 # we can assume that this page is the last one - there
1856 # are no more ids on further pages - no need to query
1859 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1864 all_ids_count = len(video_ids)
1865 playliststart = self._downloader.params.get('playliststart', 1) - 1
1866 playlistend = self._downloader.params.get('playlistend', -1)
1868 if playlistend == -1:
1869 video_ids = video_ids[playliststart:]
1871 video_ids = video_ids[playliststart:playlistend]
1873 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1874 (username, all_ids_count, len(video_ids)))
1876 for video_id in video_ids:
1877 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1880 class BlipTVUserIE(InfoExtractor):
1881 """Information Extractor for blip.tv users."""
1883 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1885 IE_NAME = u'blip.tv:user'
1887 def __init__(self, downloader=None):
1888 InfoExtractor.__init__(self, downloader)
1890 def report_download_page(self, username, pagenum):
1891 """Report attempt to download user page."""
1892 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1893 (self.IE_NAME, username, pagenum))
1895 def _real_extract(self, url):
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1902 username = mobj.group(1)
1904 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1906 request = urllib2.Request(url)
1909 page = urllib2.urlopen(request).read().decode('utf-8')
1910 mobj = re.search(r'data-users-id="([^"]+)"', page)
1911 page_base = page_base % mobj.group(1)
1912 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1917 # Download video ids using BlipTV Ajax calls. Result size per
1918 # query is limited (currently to 12 videos) so we need to query
1919 # page by page until there are no video ids - it means we got
1926 self.report_download_page(username, pagenum)
1928 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1931 page = urllib2.urlopen(request).read().decode('utf-8')
1932 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1933 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936 # Extract video identifiers
1939 for mobj in re.finditer(r'href="/([^"]+)"', page):
1940 if mobj.group(1) not in ids_in_page:
1941 ids_in_page.append(unescapeHTML(mobj.group(1)))
1943 video_ids.extend(ids_in_page)
1945 # A little optimization - if current page is not
1946 # "full", ie. does not contain PAGE_SIZE video ids then
1947 # we can assume that this page is the last one - there
1948 # are no more ids on further pages - no need to query
1951 if len(ids_in_page) < self._PAGE_SIZE:
1956 all_ids_count = len(video_ids)
1957 playliststart = self._downloader.params.get('playliststart', 1) - 1
1958 playlistend = self._downloader.params.get('playlistend', -1)
1960 if playlistend == -1:
1961 video_ids = video_ids[playliststart:]
1963 video_ids = video_ids[playliststart:playlistend]
1965 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1966 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1968 for video_id in video_ids:
1969 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'return [{') left byte-identical;
# only comments added.
1972 class DepositFilesIE(InfoExtractor):
1973 """Information extractor for depositfiles.com"""
1975 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1976 IE_NAME = u'DepositFiles'
1978 def __init__(self, downloader=None):
1979 InfoExtractor.__init__(self, downloader)
1981 def report_download_webpage(self, file_id):
1982 """Report webpage download."""
1983 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1985 def report_extraction(self, file_id):
1986 """Report information extraction."""
1987 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1989 def _real_extract(self, url):
# Extract the file id from the last URL path segment, then force the
# English locale so the scrape regexes below match.
1990 file_id = url.split('/')[-1]
1991 # Rebuild url in english locale
1992 url = 'http://depositfiles.com/en/files/' + file_id
1994 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download' button.
1995 free_download_indication = { 'gateway_result' : '1' }
1996 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1998 self.report_download_webpage(file_id)
1999 webpage = urllib2.urlopen(request).read()
2000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode();
# compare str(err) used elsewhere in this file. Confirm against full source.
2001 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % u(err))
2004 # Search for the real file URL
2005 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2006 if (mobj is None) or (mobj.group(1) is None):
2007 # Try to figure out reason of the error.
# The site shows an '<strong>Attention...</strong>' notice when the
# download is restricted (e.g. rate limited); surface it to the user.
2008 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2009 if (mobj is not None) and (mobj.group(1) is not None):
2010 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2011 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2013 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2016 file_url = mobj.group(1)
# File extension taken from the download URL (splitext keeps the dot; [1:] drops it).
2017 file_extension = os.path.splitext(file_url)[1][1:]
2019 # Search for file title
2020 mobj = re.search(r'<b title="(.*?)">', webpage)
2022 self._downloader.trouble(u'ERROR: unable to extract title')
2024 file_title = mobj.group(1).decode('utf-8')
# Result info dictionary (surrounding 'return [{' appears sampled out).
2027 'id': file_id.decode('utf-8'),
2028 'url': file_url.decode('utf-8'),
2030 'upload_date': None,
2031 'title': file_title,
2032 'ext': file_extension.decode('utf-8'),
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, dict bodies) left byte-identical;
# only comments added.
2036 class FacebookIE(InfoExtractor):
2037 """Information Extractor for Facebook"""
2040 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2041 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2042 _NETRC_MACHINE = 'facebook'
# Format identifiers ordered best-first; used for quality selection below.
2043 _available_formats = ['video', 'highqual', 'lowqual']
2044 _video_extensions = {
2049 IE_NAME = u'facebook'
2051 def __init__(self, downloader=None):
2052 InfoExtractor.__init__(self, downloader)
2054 def _reporter(self, message):
2055 """Add header and report message."""
2056 self._downloader.to_screen(u'[facebook] %s' % message)
2058 def report_login(self):
2059 """Report attempt to log in."""
2060 self._reporter(u'Logging in')
2062 def report_video_webpage_download(self, video_id):
2063 """Report attempt to download video webpage."""
2064 self._reporter(u'%s: Downloading video webpage' % video_id)
2066 def report_information_extraction(self, video_id):
2067 """Report attempt to extract video information."""
2068 self._reporter(u'%s: Extracting video information' % video_id)
2070 def _parse_page(self, video_webpage):
2071 """Extract video information from page"""
# Map of info-dict key -> scrape regex; values come from inline JS calls
# in Facebook's video page markup.
2073 data = {'title': r'\("video_title", "(.*?)"\)',
2074 'description': r'<div class="datawrap">(.*?)</div>',
2075 'owner': r'\("video_owner_name", "(.*?)"\)',
2076 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2079 for piece in data.keys():
2080 mobj = re.search(data[piece], video_webpage)
2081 if mobj is not None:
# Values are JS-escaped unicode inside a utf-8 page; unescape then unquote.
2082 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one candidate URL per known format ('video', 'highqual', 'lowqual').
2086 for fmt in self._available_formats:
2087 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2088 if mobj is not None:
2089 # URL is in a Javascript segment inside an escaped Unicode format within
2090 # the generally utf-8 page
2091 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2092 video_info['video_urls'] = video_urls
2096 def _real_initialize(self):
# Log in before extraction: credentials come from --username/--password
# or, failing that, from a 'facebook' entry in ~/.netrc.
2097 if self._downloader is None:
2102 downloader_params = self._downloader.params
2104 # Attempt to use provided username and password or .netrc data
2105 if downloader_params.get('username', None) is not None:
2106 useremail = downloader_params['username']
2107 password = downloader_params['password']
2108 elif downloader_params.get('usenetrc', False):
2110 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2111 if info is not None:
2115 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2116 except (IOError, netrc.NetrcParseError), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2117 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
2120 if useremail is None:
2129 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2132 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2133 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2134 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
2140 def _real_extract(self, url):
2141 mobj = re.match(self._VALID_URL, url)
2143 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2145 video_id = mobj.group('ID')
# Download the video page (authenticated session from _real_initialize).
2148 self.report_video_webpage_download(video_id)
2149 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2151 page = urllib2.urlopen(request)
2152 video_webpage = page.read()
2153 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2157 # Start extracting information
2158 self.report_information_extraction(video_id)
2160 # Extract information
2161 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; extraction aborts if either is missing.
2164 if 'owner' not in video_info:
2165 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2167 video_uploader = video_info['owner']
2170 if 'title' not in video_info:
2171 self._downloader.trouble(u'ERROR: unable to extract video title')
2173 video_title = video_info['title']
2174 video_title = video_title.decode('utf-8')
# thumbnail is optional -- warn and fall back to an empty string.
2177 if 'thumbnail' not in video_info:
2178 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2179 video_thumbnail = ''
2181 video_thumbnail = video_info['thumbnail']
# upload date: parse the page's RFC-2822 style date into YYYYMMDD.
2185 if 'upload_date' in video_info:
2186 upload_time = video_info['upload_date']
2187 timetuple = email.utils.parsedate_tz(upload_time)
2188 if timetuple is not None:
2190 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2195 video_description = video_info.get('description', 'No description available.')
# Format selection: honor --format / --format-limit against the formats
# actually present in url_map, best-first per _available_formats order.
2197 url_map = video_info['video_urls']
2198 if len(url_map.keys()) > 0:
2199 # Decide which formats to download
2200 req_format = self._downloader.params.get('format', None)
2201 format_limit = self._downloader.params.get('format_limit', None)
2203 if format_limit is not None and format_limit in self._available_formats:
2204 format_list = self._available_formats[self._available_formats.index(format_limit):]
2206 format_list = self._available_formats
2207 existing_formats = [x for x in format_list if x in url_map]
2208 if len(existing_formats) == 0:
2209 self._downloader.trouble(u'ERROR: no known formats available for video')
2211 if req_format is None:
2212 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2213 elif req_format == 'worst':
2214 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2215 elif req_format == '-1':
2216 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2219 if req_format not in url_map:
2220 self._downloader.trouble(u'ERROR: requested format not available')
2222 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
2225 for format_param, video_real_url in video_url_list:
2227 video_extension = self._video_extensions.get(format_param, 'mp4')
2230 'id': video_id.decode('utf-8'),
2231 'url': video_real_url.decode('utf-8'),
2232 'uploader': video_uploader.decode('utf-8'),
2233 'upload_date': upload_date,
2234 'title': video_title,
2235 'ext': video_extension.decode('utf-8'),
2236 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2237 'thumbnail': video_thumbnail.decode('utf-8'),
2238 'description': video_description.decode('utf-8'),
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'info = {') left byte-identical;
# only comments added.
2242 class BlipTVIE(InfoExtractor):
2243 """Information extractor for blip.tv"""
2245 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the filename extension off the media URL.
2246 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2247 IE_NAME = u'blip.tv'
2249 def report_extraction(self, file_id):
2250 """Report information extraction."""
2251 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2253 def report_direct_download(self, title):
2254 """Report direct (non-JSON) download."""
2255 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2257 def _real_extract(self, url):
2258 mobj = re.match(self._VALID_URL, url)
2260 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for its JSON representation of the page
# (cchar is '?' or '&' depending on whether url already has a query).
2267 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2268 request = urllib2.Request(json_url.encode('utf-8'))
2269 self.report_extraction(mobj.group(1))
2272 urlh = urllib2.urlopen(request)
# If the server answers with the media itself instead of JSON,
# derive title/ext from the URL's basename and download directly.
2273 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2274 basename = url.split('/')[-1]
2275 title,ext = os.path.splitext(basename)
2276 title = title.decode('UTF-8')
2277 ext = ext.replace('.', '')
2278 self.report_direct_download(title)
2283 'upload_date': None,
2288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2289 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
2291 if info is None: # Regular URL
2293 json_code = urlh.read()
2294 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2295 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % u(err))
# Parse the JSON payload; blip.tv nests the record under 'Post' sometimes.
2299 json_data = json.loads(json_code)
2300 if 'Post' in json_data:
2301 data = json_data['Post']
# datestamp like '08-15-11 10:30AM' -> YYYYMMDD.
2305 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2306 video_url = data['media']['url']
2307 umobj = re.match(self._URL_EXT, video_url)
2309 raise ValueError('Can not determine filename extension')
2310 ext = umobj.group(1)
2313 'id': data['item_id'],
2315 'uploader': data['display_name'],
2316 'upload_date': upload_date,
2317 'title': data['title'],
2319 'format': data['media']['mimeType'],
2320 'thumbnail': data['thumbnailUrl'],
2321 'description': data['description'],
2322 'player_url': data['embedUrl']
2324 except (ValueError,KeyError), err:
2325 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv's CDN serves different content depending on User-Agent;
# impersonate iTunes to get the downloadable media.
2328 std_headers['User-Agent'] = 'iTunes/10.6.1'
2332 class MyVideoIE(InfoExtractor):
2333 """Information Extractor for myvideo.de."""
2335 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2336 IE_NAME = u'myvideo'
2338 def __init__(self, downloader=None):
2339 InfoExtractor.__init__(self, downloader)
2341 def report_download_webpage(self, video_id):
2342 """Report webpage download."""
2343 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2345 def report_extraction(self, video_id):
2346 """Report information extraction."""
2347 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2349 def _real_extract(self,url):
2350 mobj = re.match(self._VALID_URL, url)
2352 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2355 video_id = mobj.group(1)
2358 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2360 self.report_download_webpage(video_id)
2361 webpage = urllib2.urlopen(request).read()
2362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
2366 self.report_extraction(video_id)
2367 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2370 self._downloader.trouble(u'ERROR: unable to extract media URL')
2372 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2374 mobj = re.search('<title>([^<]+)</title>', webpage)
2376 self._downloader.trouble(u'ERROR: unable to extract title')
2379 video_title = mobj.group(1)
2385 'upload_date': None,
2386 'title': video_title,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, dict bodies) left byte-identical;
# only comments added.
2390 class ComedyCentralIE(InfoExtractor):
2391 """Information extractor for The Daily Show and Colbert Report """
# Matches either a shortname alias (':tds', ':colbertnation', ...) or a
# full-episodes URL on thedailyshow.com / colbertnation.com.
2393 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2394 IE_NAME = u'comedycentral'
# Bitrates offered by the MRSS feed, lowest-quality last selected via turls[-1].
2396 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2398 _video_extensions = {
2406 _video_dimensions = {
2415 def report_extraction(self, episode_id):
2416 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2418 def report_config_download(self, episode_id):
2419 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2421 def report_index_download(self, episode_id):
2422 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2424 def report_player_url(self, episode_id):
2425 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2428 def _print_formats(self, formats):
2429 print('Available formats:')
2431 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2434 def _real_extract(self, url):
2435 mobj = re.match(self._VALID_URL, url)
2437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname aliases redirect to the show's full-episodes index page.
2440 if mobj.group('shortname'):
2441 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2442 url = u'http://www.thedailyshow.com/full-episodes/'
2444 url = u'http://www.colbertnation.com/full-episodes/'
2445 mobj = re.match(self._VALID_URL, url)
2446 assert mobj is not None
# dlNewest: no specific episode given, so follow the redirect to the latest.
2448 dlNewest = not mobj.group('episode')
2450 epTitle = mobj.group('showname')
2452 epTitle = mobj.group('episode')
2454 req = urllib2.Request(url)
2455 self.report_extraction(epTitle)
2457 htmlHandle = urllib2.urlopen(req)
2458 html = htmlHandle.read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
# Re-match after the HTTP redirect to learn which episode we landed on.
2463 url = htmlHandle.geturl()
2464 mobj = re.match(self._VALID_URL, url)
2466 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2468 if mobj.group('episode') == '':
2469 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2471 epTitle = mobj.group('episode')
# Find the Flash player URL (and the mgid-style media URI inside it).
2473 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2475 if len(mMovieParams) == 0:
2476 # The Colbert Report embeds the information in a without
2477 # a URL prefix; so extract the alternate reference
2478 # and then add the URL prefix manually.
2480 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2481 if len(altMovieParams) == 0:
2482 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2485 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL through its redirects (needed for rtmpdump).
2487 playerUrl_raw = mMovieParams[0][0]
2488 self.report_player_url(epTitle)
2490 urlHandle = urllib2.urlopen(playerUrl_raw)
2491 playerUrl = urlHandle.geturl()
2492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2493 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + u(err))
# Download the MRSS show index listing every act of the episode.
2496 uri = mMovieParams[0][1]
2497 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2498 self.report_index_download(epTitle)
2500 indexXml = urllib2.urlopen(indexUrl).read()
2501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2502 self._downloader.trouble(u'ERROR: unable to download episode index: ' + u(err))
# One <item> per act; each has its own media configuration document.
2507 idoc = xml.etree.ElementTree.fromstring(indexXml)
2508 itemEls = idoc.findall('.//item')
2509 for itemEl in itemEls:
2510 mediaId = itemEl.findall('./guid')[0].text
2511 shortMediaId = mediaId.split(':')[-1]
2512 showId = mediaId.split(':')[-2].replace('.com', '')
2513 officialTitle = itemEl.findall('./title')[0].text
2514 officialDate = itemEl.findall('./pubDate')[0].text
2516 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2517 urllib.urlencode({'uri': mediaId}))
2518 configReq = urllib2.Request(configUrl)
2519 self.report_config_download(epTitle)
2521 configXml = urllib2.urlopen(configReq).read()
2522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2523 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
# turls collects (bitrate, rtmp-url) pairs from the <rendition> elements.
2526 cdoc = xml.etree.ElementTree.fromstring(configXml)
2528 for rendition in cdoc.findall('.//rendition'):
2529 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2533 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2536 if self._downloader.params.get('listformats', None):
2537 self._print_formats([i[0] for i in turls])
2540 # For now, just pick the highest bitrate
2541 format,video_url = turls[-1]
2543 # Get the format arg from the arg stream
2544 req_format = self._downloader.params.get('format', None)
2546 # Select format if we can find one
2549 format, video_url = f, v
2552 # Patch to download from alternative CDN, which does not
2553 # break on current RTMPDump builds
2554 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2555 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2557 if video_url.startswith(broken_cdn):
2558 video_url = video_url.replace(broken_cdn, better_cdn)
2560 effTitle = showId + u'-' + epTitle
2565 'upload_date': officialDate,
2570 'description': officialTitle,
2571 'player_url': None #playerUrl
2574 results.append(info)
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2579 class EscapistIE(InfoExtractor):
2580 """Information extractor for The Escapist """
2582 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2583 IE_NAME = u'escapist'
2585 def report_extraction(self, showName):
2586 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2588 def report_config_download(self, showName):
2589 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2591 def _real_extract(self, url):
2592 mobj = re.match(self._VALID_URL, url)
2594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2596 showName = mobj.group('showname')
2597 videoId = mobj.group('episode')
2599 self.report_extraction(showName)
2601 webPage = urllib2.urlopen(url)
2602 webPageBytes = webPage.read()
# Decode the page using the charset declared in the Content-Type header,
# falling back to utf-8 when none is declared.
2603 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2604 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2606 self._downloader.trouble(u'ERROR: unable to download webpage: ' + u(err))
# Scrape OpenGraph/meta tags for description, thumbnail and player URL;
# the player URL carries a 'config=' query with the JSON config location.
2609 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2610 description = unescapeHTML(descMatch.group(1))
2611 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2612 imgUrl = unescapeHTML(imgMatch.group(1))
2613 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2614 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2615 configUrlMatch = re.search('config=(.*)$', playerUrl)
2616 configUrl = urllib2.unquote(configUrlMatch.group(1))
2618 self.report_config_download(showName)
2620 configJSON = urllib2.urlopen(configUrl).read()
2621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622 self._downloader.trouble(u'ERROR: unable to download configuration: ' + u(err))
2625 # Technically, it's JavaScript, not JSON
# Single quotes -> double quotes so json.loads accepts the JS object literal.
2626 configJSON = configJSON.replace("'", '"')
2629 config = json.loads(configJSON)
2630 except (ValueError,), err:
2631 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + u(err))
# playlist[1] holds the actual episode entry (playlist[0] is presumably an
# intro/ad slot -- TODO confirm against the full source).
2634 playlist = config['playlist']
2635 videoUrl = playlist[1]['url']
2640 'uploader': showName,
2641 'upload_date': None,
2644 'thumbnail': imgUrl,
2645 'description': description,
2646 'player_url': playerUrl,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2652 class CollegeHumorIE(InfoExtractor):
2653 """Information extractor for collegehumor.com"""
2655 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2656 IE_NAME = u'collegehumor'
2658 def report_webpage(self, video_id):
2659 """Report webpage download."""
2660 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2662 def report_extraction(self, video_id):
2663 """Report information extraction."""
2664 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2666 def _real_extract(self, url):
2667 mobj = re.match(self._VALID_URL, url)
2669 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2671 video_id = mobj.group('videoid')
2673 self.report_webpage(video_id)
2674 request = urllib2.Request(url)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2678 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
# The page embeds an internal numeric video id distinct from the URL id;
# it is the key for the moogaloop metadata XML below.
2681 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2683 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2685 internal_video_id = m.group('internalvideoid')
2689 'internal_id': internal_video_id,
2691 'upload_date': None,
# Fetch the metadata XML and fill the remaining info fields from it.
2694 self.report_extraction(video_id)
2695 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2697 metaXml = urllib2.urlopen(xmlUrl).read()
2698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2699 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
2702 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2704 videoNode = mdoc.findall('./video')[0]
2705 info['description'] = videoNode.findall('./description')[0].text
2706 info['title'] = videoNode.findall('./caption')[0].text
2707 info['url'] = videoNode.findall('./file')[0].text
2708 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension = everything after the last dot of the media URL.
2709 info['ext'] = info['url'].rpartition('.')[2]
2711 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2717 class XVideosIE(InfoExtractor):
2718 """Information extractor for xvideos.com"""
2720 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2721 IE_NAME = u'xvideos'
2723 def report_webpage(self, video_id):
2724 """Report webpage download."""
2725 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2727 def report_extraction(self, video_id):
2728 """Report information extraction."""
2729 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2731 def _real_extract(self, url):
2732 mobj = re.match(self._VALID_URL, url)
2734 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2736 video_id = mobj.group(1).decode('utf-8')
2738 self.report_webpage(video_id)
2740 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2742 webpage = urllib2.urlopen(request).read()
2743 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2744 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2747 self.report_extraction(video_id)
# Extract video URL: the flash vars carry a percent-encoded flv_url.
2751 mobj = re.search(r'flv_url=(.+?)&', webpage)
2753 self._downloader.trouble(u'ERROR: unable to extract video url')
2755 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Extract video title from the page <title>, dropping the ' - XVID...' suffix.
2759 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2761 self._downloader.trouble(u'ERROR: unable to extract video title')
2763 video_title = mobj.group(1).decode('utf-8')
2766 # Extract video thumbnail
# group(0): the full matched thumbnail URL is used, not just the filename group.
2767 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2769 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2771 video_thumbnail = mobj.group(0).decode('utf-8')
2777 'upload_date': None,
2778 'title': video_title,
2780 'thumbnail': video_thumbnail,
2781 'description': None,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'return [{') left byte-identical;
# only comments added.
2787 class SoundcloudIE(InfoExtractor):
2788 """Information extractor for soundcloud.com
2789 To access the media, the uid of the song and a stream token
2790 must be extracted from the page source and the script must make
2791 a request to media.soundcloud.com/crossdomain.xml. Then
2792 the media can be grabbed by requesting from an url composed
2793 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
2796 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2797 IE_NAME = u'soundcloud'
2799 def __init__(self, downloader=None):
2800 InfoExtractor.__init__(self, downloader)
2802 def report_webpage(self, video_id):
2803 """Report webpage download."""
2804 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2806 def report_extraction(self, video_id):
2807 """Report information extraction."""
2808 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2810 def _real_extract(self, url):
2811 mobj = re.match(self._VALID_URL, url)
2813 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2816 # extract uploader (which is in the url)
2817 uploader = mobj.group(1).decode('utf-8')
2818 # extract simple title (uploader + slug of song title)
2819 slug_title = mobj.group(2).decode('utf-8')
2820 simple_title = uploader + u'-' + slug_title
2822 self.report_webpage('%s/%s' % (uploader, slug_title))
2824 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2826 webpage = urllib2.urlopen(request).read()
2827 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2828 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2831 self.report_extraction('%s/%s' % (uploader, slug_title))
2833 # extract uid and stream token that soundcloud hands out for access
2834 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2836 video_id = mobj.group(1)
2837 stream_token = mobj.group(2)
2839 # extract unsimplified title
2840 mobj = re.search('"title":"(.*?)",', webpage)
2842 title = mobj.group(1).decode('utf-8')
# Fall back to the URL-derived title when the page carries no title JSON.
2844 title = simple_title
2846 # construct media url (with uid/token)
2847 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2848 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; default used when the page has none.
2851 description = u'No description available'
2852 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2854 description = mobj.group(1)
# Upload date: parse "Month DD, YYYY HH:MM" from the pretty-date markup
# into the canonical YYYYMMDD form; failures are only logged to stderr.
2858 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2861 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2862 except Exception, e:
2863 self._downloader.to_stderr(u(e))
2865 # for soundcloud, a request to a cross domain is required for cookies
2866 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2869 'id': video_id.decode('utf-8'),
2871 'uploader': uploader.decode('utf-8'),
2872 'upload_date': upload_date,
2875 'description': description.decode('utf-8')
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'return [{') left byte-identical;
# only comments added.
2879 class InfoQIE(InfoExtractor):
2880 """Information extractor for infoq.com"""
2882 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2885 def report_webpage(self, video_id):
2886 """Report webpage download."""
2887 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2889 def report_extraction(self, video_id):
2890 """Report information extraction."""
2891 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2893 def _real_extract(self, url):
2894 mobj = re.match(self._VALID_URL, url)
2896 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2899 self.report_webpage(url)
2901 request = urllib2.Request(url)
2903 webpage = urllib2.urlopen(request).read()
2904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2905 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2908 self.report_extraction(url)
# Extract video URL: jsclassref holds a base64-encoded rtmpe path fragment.
2912 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2914 self._downloader.trouble(u'ERROR: unable to extract video url')
2916 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Extract title from the inline JS contentTitle assignment.
2920 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2922 self._downloader.trouble(u'ERROR: unable to extract video title')
2924 video_title = mobj.group(1).decode('utf-8')
2926 # Extract description
2927 video_description = u'No description available.'
2928 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2929 if mobj is not None:
2930 video_description = mobj.group(1).decode('utf-8')
# id/ext derived from the media file name at the end of the rtmpe URL.
2932 video_filename = video_url.split('/')[-1]
2933 video_id, extension = video_filename.split('.')
2939 'upload_date': None,
2940 'title': video_title,
2941 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2943 'description': video_description,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'return [{') left byte-identical;
# only comments added.
2948 class MixcloudIE(InfoExtractor):
2949 """Information extractor for www.mixcloud.com"""
2950 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2951 IE_NAME = u'mixcloud'
2953 def __init__(self, downloader=None):
2954 InfoExtractor.__init__(self, downloader)
2956 def report_download_json(self, file_id):
2957 """Report JSON download."""
2958 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2960 def report_extraction(self, file_id):
2961 """Report information extraction."""
2962 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2964 def get_urls(self, jsonData, fmt, bitrate='best'):
2965 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a bare [urls] list; the
# TypeError fallback handles the bitrate-less shape.
2968 bitrate_list = jsonData[fmt]
2969 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2970 bitrate = max(bitrate_list) # select highest
2972 url_list = jsonData[fmt][bitrate]
2973 except TypeError: # we have no bitrate info.
2974 url_list = jsonData[fmt]
2977 def check_urls(self, url_list):
2978 """Returns 1st active url from list"""
# Probes each candidate URL with a GET; the first that opens wins.
2979 for url in url_list:
2981 urllib2.urlopen(url)
2983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2988 def _print_formats(self, formats):
2989 print('Available formats:')
2990 for fmt in formats.keys():
2991 for b in formats[fmt]:
2993 ext = formats[fmt][b][0]
2994 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2995 except TypeError: # we have no bitrate info
2996 ext = formats[fmt][0]
2997 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3000 def _real_extract(self, url):
3001 mobj = re.match(self._VALID_URL, url)
3003 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3005 # extract uploader & filename from url
3006 uploader = mobj.group(1).decode('utf-8')
3007 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3009 # construct API request
3010 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3011 # retrieve .json file with links to files
3012 request = urllib2.Request(file_url)
3014 self.report_download_json(file_url)
3015 jsonData = urllib2.urlopen(request).read()
3016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
3017 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % u(err))
3021 json_data = json.loads(jsonData)
3022 player_url = json_data['player_swf_url']
3023 formats = dict(json_data['audio_formats'])
3025 req_format = self._downloader.params.get('format', None)
3028 if self._downloader.params.get('listformats', None):
3029 self._print_formats(formats)
# Format selection: no/-'best' request tries every format until one of its
# URLs responds; otherwise the requested format must exist in the JSON.
3032 if req_format is None or req_format == 'best':
3033 for format_param in formats.keys():
3034 url_list = self.get_urls(formats, format_param)
3036 file_url = self.check_urls(url_list)
3037 if file_url is not None:
3040 if req_format not in formats.keys():
3041 self._downloader.trouble(u'ERROR: format is not available')
3044 url_list = self.get_urls(formats, req_format)
3045 file_url = self.check_urls(url_list)
3046 format_param = req_format
3049 'id': file_id.decode('utf-8'),
3050 'url': file_url.decode('utf-8'),
3051 'uploader': uploader.decode('utf-8'),
3052 'upload_date': None,
3053 'title': json_data['name'],
3054 'ext': file_url.split('.')[-1].decode('utf-8'),
3055 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3056 'thumbnail': json_data['thumbnail_url'],
3057 'description': json_data['description'],
3058 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE handles three URL shapes -- a specific video, a
# course page, and the site root -- recursing through self.extract() for the
# latter two.  NOTE(review): this view of the source is elided; guard
# clauses, `try:` openers, `return`s and dict-literal braces are missing
# between the numbered lines below.
3061 class StanfordOpenClassroomIE(InfoExtractor):
3062 """Information extractor for Stanford's Open ClassRoom"""
# Named groups <course> and <video> select which of the three modes runs.
3064 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3065 IE_NAME = u'stanfordoc'
3067 def report_download_webpage(self, objid):
3068 """Report information extraction."""
3069 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3071 def report_extraction(self, video_id):
3072 """Report information extraction."""
3073 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3075 def _real_extract(self, url):
3076 mobj = re.match(self._VALID_URL, url)
# Invalid-URL guard (the `if mobj is None:` line is elided here).
3078 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Mode 1: both course and video present -> extract a single video.
3081 if mobj.group('course') and mobj.group('video'): # A specific video
3082 course = mobj.group('course')
3083 video = mobj.group('video')
3085 'id': course + '_' + video,
3087 'upload_date': None,
3090 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the media files.
3091 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3092 xmlUrl = baseUrl + video + '.xml'
3094 metaXml = urllib2.urlopen(xmlUrl).read()
3095 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3096 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
3098 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML; the matching
# `except` line is elided -- only its error report is visible below.
3100 info['title'] = mdoc.findall('./title')[0].text
3101 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3103 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3105 info['ext'] = info['url'].rpartition('.')[2]
# Mode 2: course page -> collect every VideoPage link and recurse.
3107 elif mobj.group('course'): # A course page
3108 course = mobj.group('course')
3113 'upload_date': None,
3116 self.report_download_webpage(info['id'])
3118 coursepage = urllib2.urlopen(url).read()
3119 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3120 self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
# Best-effort scraping: fall back to the id when no <h1> title exists.
3123 m = re.search('<h1>([^<]+)</h1>', coursepage)
3125 info['title'] = unescapeHTML(m.group(1))
3127 info['title'] = info['id']
3129 m = re.search('<description>([^<]+)</description>', coursepage)
3131 info['description'] = unescapeHTML(m.group(1))
# orderedSet de-duplicates while keeping first-seen order.
3133 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3136 'type': 'reference',
3137 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Resolve each reference recursively and flatten into `results`.
3141 for entry in info['list']:
3142 assert entry['type'] == 'reference'
3143 results += self.extract(entry['url'])
# Mode 3: site root -> enumerate all course pages and recurse likewise.
3148 'id': 'Stanford OpenClassroom',
3151 'upload_date': None,
3154 self.report_download_webpage(info['id'])
3155 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3157 rootpage = urllib2.urlopen(rootURL).read()
3158 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3159 self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
3162 info['title'] = info['id']
3164 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3167 'type': 'reference',
3168 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3173 for entry in info['list']:
3174 assert entry['type'] == 'reference'
3175 results += self.extract(entry['url'])
# MTVIE scrapes <meta> tags (song, performer, mtvn_uri) and a playlist id
# from the video page, fetches a mediaGen XML playlist, and takes the last
# rendition as the highest quality.  NOTE(review): this view is elided --
# `if mobj is None:` guards, `try:` openers, `return`s and the result
# dict's braces are missing between the numbered lines.
3178 class MTVIE(InfoExtractor):
3179 """Information extractor for MTV.com"""
# <proto> is optional so scheme-less URLs are accepted and fixed up below.
3181 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3184 def report_webpage(self, video_id):
3185 """Report information extraction."""
3186 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3188 def report_extraction(self, video_id):
3189 """Report information extraction."""
3190 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3192 def _real_extract(self, url):
3193 mobj = re.match(self._VALID_URL, url)
3195 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3197 if not mobj.group('proto'):
3198 url = 'http://' + url
3199 video_id = mobj.group('videoid')
3200 self.report_webpage(video_id)
3202 request = urllib2.Request(url)
3204 webpage = urllib2.urlopen(request).read()
3205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3206 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
# Each of the following <meta> lookups has an elided `if mobj is None:`
# guard in front of its trouble() report.
3209 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3211 self._downloader.trouble(u'ERROR: unable to extract song name')
# Page is served as latin-1; decode before unescaping entities.
3213 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3214 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3216 self._downloader.trouble(u'ERROR: unable to extract performer')
3218 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3219 video_title = performer + ' - ' + song_name
3221 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message is missing its verb ("unable to extract mtvn_uri").
3223 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3225 mtvn_uri = mobj.group(1)
3227 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3229 self._downloader.trouble(u'ERROR: unable to extract content id')
3231 content_id = mobj.group(1)
# The mediaGen endpoint returns an XML list of renditions for this video.
3233 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3234 self.report_extraction(video_id)
3235 request = urllib2.Request(videogen_url)
3237 metadataXml = urllib2.urlopen(request).read()
3238 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3239 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % u(err))
3242 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3243 renditions = mdoc.findall('.//rendition')
3245 # For now, always pick the highest quality.
3246 rendition = renditions[-1]
# Format label: e.g. "mp4-640x360_450"; missing attrs hit the elided
# exception handler whose report is the trouble() call below.
3249 _,_,ext = rendition.attrib['type'].partition('/')
3250 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3251 video_url = rendition.find('./src').text
3253 self._downloader.trouble('Invalid rendition field.')
# Visible fields of the returned info dictionary (braces elided).
3259 'uploader': performer,
3260 'upload_date': None,
3261 'title': video_title,
# YoukuIE: fetches a JSON playlist, de-obfuscates the segment file id with a
# seed-keyed character shuffle, and yields one info dict per flv segment.
# NOTE(review): this view is elided -- some `def` lines, `try:` openers,
# guards and `return`s are missing between the numbered lines.
3269 class YoukuIE(InfoExtractor):
3271 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3274 def __init__(self, downloader=None):
3275 InfoExtractor.__init__(self, downloader)
3277 def report_download_webpage(self, file_id):
3278 """Report webpage download."""
3279 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3281 def report_extraction(self, file_id):
3282 """Report information extraction."""
3283 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp + two random numbers (the enclosing
# `def _gen_sid(self):` line is elided from this view).
3286 nowTime = int(time.time() * 1000)
3287 random1 = random.randint(1000,1998)
3288 random2 = random.randint(1000,9999)
3290 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the charset keyed by `seed` (a linear
# congruential step picks which remaining char to emit next).
3292 def _get_file_ID_mix_string(self, seed):
3294 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3296 for i in range(len(source)):
3297 seed = (seed * 211 + 30031 ) % 65536
# NOTE(review): under Python 2, `seed / 65536` is integer division --
# confirm this matches the intended (fractional) ratio of the original
# algorithm before porting.
3298 index = math.floor(seed / 65536 * len(source) )
3299 mixed.append(source[int(index)])
3300 source.remove(source[int(index)])
3301 #return ''.join(mixed)
# Map the '*'-separated numeric fileId through the mixed alphabet.
3304 def _get_file_id(self, fileId, seed):
3305 mixed = self._get_file_ID_mix_string(seed)
3306 ids = fileId.split('*')
3310 realId.append(mixed[int(ch)])
3311 return ''.join(realId)
3313 def _real_extract(self, url):
3314 mobj = re.match(self._VALID_URL, url)
3316 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3318 video_id = mobj.group('ID')
3320 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3322 request = urllib2.Request(info_url, None, std_headers)
3324 self.report_download_webpage(video_id)
3325 jsondata = urllib2.urlopen(request).read()
# Note: this method uses the modern `as err` except form, unlike the rest
# of the file's comma-form clauses.
3326 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3327 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
3330 self.report_extraction(video_id)
3332 config = json.loads(jsondata)
3334 video_title = config['data'][0]['title']
3335 seed = config['data'][0]['seed']
3337 format = self._downloader.params.get('format', None)
3338 supported_format = config['data'][0]['streamfileids'].keys()
# Format selection: 'best' prefers hd2; 'worst' takes the low end
# (the branch bodies are elided in this view).
3340 if format is None or format == 'best':
3341 if 'hd2' in supported_format:
3346 elif format == 'worst':
3354 fileid = config['data'][0]['streamfileids'][format]
3355 seg_number = len(config['data'][0]['segs'][format])
# Per-segment access keys, one per flv chunk.
3358 for i in xrange(seg_number):
3359 keys.append(config['data'][0]['segs'][format][i]['k'])
3362 #youku only could be viewed from mainland china
3364 self._downloader.trouble(u'ERROR: unable to extract info section')
3368 sid = self._gen_sid()
3369 fileid = self._get_file_id(fileid, seed)
3371 #column 8,9 of fileid represent the segment number
3372 #fileid[7:9] should be changed
3373 for index, key in enumerate(keys):
# Splice the zero-based segment index (as two hex digits) into the id.
3375 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3376 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3379 'id': '%s_part%02d' % (video_id, index),
3380 'url': download_url,
3382 'upload_date': None,
3383 'title': video_title,
# One info dict per segment; the list is (presumably) returned after the
# loop -- the return line is elided here.
3386 files_info.append(info)
# XNXXIE: pulls flv_url, title and thumbnail out of the video page with
# three class-level regexes.  NOTE(review): elided view -- the IE_NAME
# assignment referenced by the report methods, `if ... is None:` guards,
# `try:` openers and `return`s are missing between the numbered lines.
3391 class XNXXIE(InfoExtractor):
3392 """Information extractor for xnxx.com"""
3394 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# flv_url is a query parameter embedded in the page, percent-encoded.
3396 VIDEO_URL_RE = r'flv_url=(.*?)&'
3397 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3398 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3400 def report_webpage(self, video_id):
3401 """Report information extraction"""
3402 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3404 def report_extraction(self, video_id):
3405 """Report information extraction"""
3406 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
3411 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3413 video_id = mobj.group(1).decode('utf-8')
3415 self.report_webpage(video_id)
3417 # Get webpage content
3419 webpage = urllib2.urlopen(url).read()
3420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): passes `err` directly, unlike the `u(err)` wrapping used
# elsewhere in this file.
3421 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3424 result = re.search(self.VIDEO_URL_RE, webpage)
3426 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url arrives percent-encoded; unquote to a usable URL.
3428 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3430 result = re.search(self.VIDEO_TITLE_RE, webpage)
3432 self._downloader.trouble(u'ERROR: unable to extract video title')
3434 video_title = result.group(1).decode('utf-8')
3436 result = re.search(self.VIDEO_THUMB_RE, webpage)
3438 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3440 video_thumbnail = result.group(1).decode('utf-8')
# Visible fields of the returned info dictionary (braces elided).
3446 'upload_date': None,
3447 'title': video_title,
3449 'thumbnail': video_thumbnail,
3450 'description': None,
# GooglePlusIE: extracts videos embedded in Google+ posts (class body
# continues beyond this view).
3454 class GooglePlusIE(InfoExtractor):
3455 """Information extractor for plus.google.com."""
# Group 1: numeric account id; group 2: post id (used as the video id).
3457 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3458 IE_NAME = u'plus.google'
# Plain pass-through constructor, kept for symmetry with the other
# extractors in this file.
3460 def __init__(self, downloader=None):
# Base class wires the downloader in via set_downloader().
3461 InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Announce that the Google+ post at *url* is being fetched."""
    # url is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Downloading entry: %s' % url.decode('utf-8')
    self._downloader.to_screen(message)
def report_date(self, upload_date):
    """Announce the upload date found for the current entry."""
    message = u'[plus.google] Entry date: %s' % (upload_date,)
    self._downloader.to_screen(message)
def report_uploader(self, uploader):
    """Announce the uploader name found for the current entry."""
    # uploader is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Uploader: %s' % uploader.decode('utf-8')
    self._downloader.to_screen(message)
def report_title(self, video_title):
    """Announce the title found for the current entry."""
    # video_title is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Title: %s' % video_title.decode('utf-8')
    self._downloader.to_screen(message)
def report_extract_vid_page(self, video_page):
    """Announce that the linked video page is being parsed."""
    # video_page is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8')
    self._downloader.to_screen(message)
3483 def _real_extract(self, url):
3484 # Extract id from URL
3485 mobj = re.match(self._VALID_URL, url)
3487 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3490 post_url = mobj.group(0)
3491 video_id = mobj.group(2)
3493 video_extension = 'flv'
3495 # Step 1, Retrieve post webpage to extract further information
3496 self.report_extract_entry(post_url)
3497 request = urllib2.Request(post_url)
3499 webpage = urllib2.urlopen(request).read()
3500 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3501 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % u(err))
3504 # Extract update date
3506 pattern = 'title="Timestamp">(.*?)</a>'
3507 mobj = re.search(pattern, webpage)
3509 upload_date = mobj.group(1)
3510 # Convert timestring to a format suitable for filename
3511 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3512 upload_date = upload_date.strftime('%Y%m%d')
3513 self.report_date(upload_date)
3517 pattern = r'rel\="author".*?>(.*?)</a>'
3518 mobj = re.search(pattern, webpage)
3520 uploader = mobj.group(1)
3521 self.report_uploader(uploader)
3524 # Get the first line for title
3526 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3527 mobj = re.search(pattern, webpage)
3529 video_title = mobj.group(1)
3530 self.report_title(video_title)
3532 # Step 2, Stimulate clicking the image box to launch video
3533 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3534 mobj = re.search(pattern, webpage)
3536 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3538 video_page = mobj.group(1)
3539 request = urllib2.Request(video_page)
3541 webpage = urllib2.urlopen(request).read()
3542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3543 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
3545 self.report_extract_vid_page(video_page)
3548 # Extract video links on video page
3549 """Extract video links of all sizes"""
3550 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3551 mobj = re.findall(pattern, webpage)
3553 self._downloader.trouble(u'ERROR: unable to extract video links')
3555 # Sort in resolution
3556 links = sorted(mobj)
3558 # Choose the lowest of the sort, i.e. highest resolution
3559 video_url = links[-1]
3560 # Only get the url. The resolution part in the tuple has no use anymore
3561 video_url = video_url[-1]
3562 # Treat escaped \u0026 style hex
3563 video_url = unicode(video_url, "unicode_escape")
3567 'id': video_id.decode('utf-8'),
3569 'uploader': uploader.decode('utf-8'),
3570 'upload_date': upload_date.decode('utf-8'),
3571 'title': video_title.decode('utf-8'),
3572 'ext': video_extension.decode('utf-8'),