10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.facebook import FacebookIE
27 from .extractor.gametrailers import GametrailersIE
28 from .extractor.generic import GenericIE
29 from .extractor.googleplus import GooglePlusIE
30 from .extractor.googlesearch import GoogleSearchIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.myvideo import MyVideoIE
33 from .extractor.statigram import StatigramIE
34 from .extractor.photobucket import PhotobucketIE
35 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
36 from .extractor.vimeo import VimeoIE
37 from .extractor.yahoo import YahooIE, YahooSearchIE
38 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
39 from .extractor.zdf import ZDFIE
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com file page to its direct download URL.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when the page cannot be retrieved or when no
        download form is present (the site's own "Attention ..." message is
        reported if one is found).
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # A POST body has to be bytes on Python 3
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once at the I/O boundary so the text regexes below work
            # on both Python 2 and Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None:
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # Only a Python 2 byte string still needs decoding at this point
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract one Escapist episode.

        Reads the page metadata, downloads the player configuration referenced
        by the og:video URL and returns a one-element list with the info
        dictionary. Raises ExtractorError on an invalid URL, invalid
        configuration JSON or a configuration without the expected entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The field name is what appears in the error message, so it has to
        # say 'title' here (it used to be a copy-pasted 'player url').
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The video is taken from the second playlist entry.
        # NOTE(review): presumably the first entry is an intro/ad clip - confirm.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find the video URL in the configuration playlist')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract a CollegeHumor video.

        Downloads the moogaloop metadata XML, then the f4m manifest it points
        to, and builds the URL of the first fragment from both. Returns a
        one-element list with the info dictionary; raises ExtractorError when
        a download fails or a document lacks an expected node or attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download XML manifest: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: node missing; KeyError: <media> without a url attribute
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # The manifest id with its last two characters cut off is used as
        # the path prefix of the fragment URL.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract video URL, title and thumbnail from an xvideos.com page.

        Returns a one-element list with the info dictionary. The thumbnail is
        optional; a missing video URL or title raises ExtractorError.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        # The whole URL is the capture group: _search_regex returns a group,
        # and capturing only the file name yields an unusable thumbnail.
        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract an InfoQ presentation.

        The stream path is stored base64-encoded and URL-quoted in the page's
        ``jsclassref`` variable. Returns a one-element list with the info
        dictionary; raises ExtractorError if the URL is invalid or the
        variable or the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # Split on the last dot only: a plain split('.') fails to unpack
        # when the file name has no dot or more than one.
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            # NOTE(review): no extension in the stream name - 'mp4' is an
            # assumed fallback, confirm against real stream names.
            video_id, extension = video_filename, 'mp4'

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it only if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping of bitrate to URL list or, when no
        bitrate information exists, the URL list itself.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link, try the next one
                continue

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair together with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast through the Mixcloud JSON API.

        Returns a one-element list with the info dictionary, or None after
        printing the formats when 'listformats' is set. Raises ExtractorError
        on an invalid URL, a failed download, an unavailable format or when no
        candidate URL can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        file_id = uploader + u'-' + self._to_text(mobj.group(2))

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that "nothing worked" is detectable
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for candidate in formats.keys():
                format_param = candidate
                file_url = self.check_urls(self.get_urls(formats, candidate))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working download URL')
        file_url = self._to_text(file_url)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _extract_linked_pages(self, page, link_regex):
        """Run the extractor on every distinct page linked from *page*.

        link_regex must capture the relative href; links are resolved against
        the MainFolder directory and visited in order of first appearance.
        Returns the concatenated results.
        """
        results = []
        for relative_url in orderedSet(re.findall(link_regex, page)):
            results += self.extract(
                'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(relative_url))
        return results

    def _real_extract(self, url):
        """Extract a single video, a whole course, or every course.

        Which one depends on the 'course' and 'video' query parameters
        matched by _VALID_URL. Returns a list of info dictionaries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')

            coursepage = self._download_webpage(url, course,
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            return self._extract_linked_pages(
                coursepage, r'<a href="(VideoPage\.php\?[^"]+)">')
        else: # Root page
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Use the shared download helper (as the course branch does) so the
            # page arrives as text; a raw urlopen().read() returns bytes, which
            # a text regex cannot search on Python 3.
            rootpage = self._download_webpage(rootURL, 'Stanford OpenClassroom',
                                        errnote='Unable to download course info page')

            return self._extract_linked_pages(
                rootpage, r'<a href="(CoursePage\.php\?[^"]+)">')
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    _WORKING = False

    def _real_extract(self, url):
        """Extract an MTV.com video through the mediaGen metadata service.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when the URL is invalid, required page metadata is
        missing, the metadata download fails or no usable rendition exists.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # NOTE(review): mtv_an is presumably the artist name and mtv_vt the
        # video title - confirm. 'performer' used to be referenced in the
        # result without ever being assigned.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')
        if song_name:
            video_title = performer + u' - ' + song_name
        else:
            video_title = performer

        # Both values are concatenated into the request URL below, so their
        # absence has to be fatal rather than a later TypeError on None.
        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri')

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'No renditions found in video metadata')

        # For now, always pick the highest quality.
        # NOTE(review): assumes renditions are listed in ascending quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: missing attribute; AttributeError: no <src> child
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id from the current time in ms and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet as a list, shuffled deterministically by seed."""
        mixed = []
        # The backslash is part of the alphabet; it is written as '\\' so the
        # literal does not depend on an invalid '\:' escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        # Force float arithmetic: integer division would always give index 0
        # on Python 2.
        seed = float(seed)
        while source:
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        # Empty chunks (e.g. from a trailing '*') carry no index
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract every segment of a Youku video.

        Returns a list with one info dictionary per segment. Raises
        ExtractorError on an invalid URL or when the playlist JSON does not
        have the expected layout.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            video_format = self._downloader.params.get('format', None)
            supported_format = list(video_data['streamfileids'].keys())

            if video_format is None or video_format == 'best':
                if 'hd2' in supported_format:
                    video_format = 'hd2'
                else:
                    video_format = 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                video_format = 'flv'
                ext = u'flv'

            fileid = video_data['streamfileids'][video_format]
            keys = [s['k'] for s in video_data['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 9 and 10 of fileid (slice [8:10]) hold the segment
        # number and are replaced by the index as two hex digits.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url.

        Video URL and title are mandatory, the thumbnail is optional.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        page = self._download_webpage(url, video_id)

        # The flash variable holds the media URL in percent-encoded form
        quoted_url = self._search_regex(self.VIDEO_URL_RE, page, u'video URL')
        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')
        thumbnail = self._search_regex(
            self.VIDEO_THUMB_RE, page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': compat_urllib_parse.unquote(quoted_url),
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }
        return [result]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos"""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url.

        The media URL is derived from the URL path alone; the page is only
        consulted for the title and the description.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = match.group(1)
        page = self._download_webpage(url, video_path)

        # Last path component doubles as id and as fallback title
        short_id = video_path.rpartition('/')[2]

        og_title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': og_title.replace('NBA.com: ', ''),
            # 'uploader_date': uploader_date,
            'description': description,
        }]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips.

        Returns a tuple (number of items in the response, list of info
        dictionaries for the items that have a video URL). Raises
        ExtractorError when the API answers with something other than a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Entry without a downloadable file
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                # 'YYYY-MM-DD...' -> 'YYYYMMDD'
                'upload_date': clip['start_time'][:10].replace('-', ''),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a channel archive, a single broadcast or a chapter.

        Channel archives are fetched page by page; broadcasts need a single
        request; chapters return the whole underlying archive file together
        with a warning naming the chapter boundaries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            # splitext yields '' when the file name has no dot, so the
            # fallback applies (rpartition returned the whole URL instead).
            video_ext = os.path.splitext(video_url)[1][1:] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page is the last one; unpaged requests need only one
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The src of the second <source> inside the <video> element is used
        media_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Page heading first, document <title> as fallback
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(
            title_patterns, page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
class SteamIE(InfoExtractor):
    """Information extractor for the trailers of a game on store.steampowered.com"""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so the flag is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos.

        Raises ExtractorError on an invalid URL, a missing game title or a
        video entry without a URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Fetch the page again through the age gate with a fixed birth date
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three match sequences are paired purely by position
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos"""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dictionary of a recorded Ustream video.

        The media URL is built from the video id alone; the page supplies
        title, uploader and thumbnail. Raises ExtractorError on an invalid
        URL or a missing title.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Non-greedy: a greedy '.+' would run on to the last double quote of
        # the line and swallow the attributes that follow data-title.
        video_title = self._html_search_regex(r'data-title="(?P<title>.+?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(
            r'so\.addVariable\("file","(.*?)"\)', page, u'video URL')

        # Container is guessed from the URL text
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>", page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(
            r'rel="image_src" href="(.*)" />', page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_title = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_title is not None:
                title = candy_title.group(1)

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': extension,
        }]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows"""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list describing the show found at url.

        All metadata comes from the ``gon.show`` JSON object embedded in the
        page. Raises ExtractorError on an invalid URL or if that object is
        missing or is not valid JSON.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + compat_str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # 'or {}' also covers keys that are present but null in the JSON
        host = data.get('host') or {}
        image = data.get('image') or {}

        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of a YouPorn video.

        Returns a list of info dictionaries (one entry unless the requested
        format is '-1' or 'all'), or None after printing the formats when
        'listformats' is set. Raises ExtractorError on an invalid URL, bad or
        incomplete page JSON, a missing download list or an unavailable
        format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # compat_str: concatenating the exception object itself is a TypeError
            raise ExtractorError(u'Missing JSON parameter: ' + compat_str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The path component before the file name starts with size and
            # bitrate, e.g. '480p_370k_8004515' -> '480p-370k'
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description,
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # NOTE(review): 'best'/'worst' rely on the page listing the links
        # from best to worst quality - confirm.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV stream referenced by a pornotube.com video page.

        The title is taken from the URL itself. Returns a single-element
        list holding the info dict. Raises ExtractorError when the URL does
        not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL; the host dots are escaped so that they only
        # match literal dots
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the upload date (optional, hence fatal=False)
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        """Extract the video served through the embed page of a youjizz.com video.

        Returns a single-element list holding the info dict; its 'id' is the
        numeric id found in the embed page URL. Raises ExtractorError when
        the URL is invalid or no embed page link is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # From here on the numeric id of the embed page replaces the URL slug
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk through the tracks of a mix via the 8tracks play/next API.

        Returns a list with one info dict per track, in the order the API
        hands them out. Raises ExtractorError when the URL is invalid.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE\.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Random play session id; it is reused for every request below
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Each response describes one track; stop when the API flags the last one
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the info dict of a keek from its id and its web page.

        The video and thumbnail URLs are derived from the id alone; the page
        is only downloaded for the title and the (optional) uploader.
        Returns a single-element list holding the info dict. Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: always match it with re.VERBOSE
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist extraction depending on the URL.

        Returns a single-element list holding either the talk info dict or
        the playlist result. Raises ExtractorError when the URL does not
        match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # The video matches are only used to pair entries up: zip() stops at
        # the shorter of the two sequences
        for _, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of htmlStreams is the one that gets downloaded
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video from its XML metadata document.

        Returns a single-element list holding the info dict. Raises
        ExtractorError when the metadata lacks the download URL or the title;
        description and thumbnail are optional and default to None.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id the file extension names the format
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format_name = extension
        else:
            format_name = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_name,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a spiegel.de video through its per-video XML description.

        The last child element of the XML document is the variant that gets
        downloaded. Returns a single-element list holding the info dict.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        # Everything after the last dot of the file name is the extension
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" (not "http?") so that both the http and https schemes match
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the video referenced by a liveleak.com view page.

        Returns a single-element list holding the info dict; description and
        uploader are optional. Raises ExtractorError when the URL does not
        match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The site prefixes its page titles; strip that prefix
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video embedded in a tumblr post.

        Returns a single-element list holding the info dict. Raises
        ExtractorError when the URL is invalid or the post page contains no
        video source.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical post URL, whatever form was given
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes from the URL, so escape it before using it
        # inside a pattern
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the mp3-320 download of a free bandcamp track.

        Returns a single-element list holding the info dict. Raises
        ExtractorError when the URL is invalid, the track has no free
        download page, or one of the expected page fragments is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict of a redtube video page.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The page carries the mp4 source tag and the title heading
        stream_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        page_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        info = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': page_title,
        }
        return [info]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Extract an ina.fr video from the MRSS document of its player.

        Returns a single-element list holding the info dict. Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # The host dots are escaped so that they only match literal dots
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the mobile mp4 stream of a howcast.com video.

        The page is always fetched from its canonical URL, rebuilt from the
        id. Returns a single-element list holding the info dict; description
        and thumbnail are optional. Raises ExtractorError when the URL does
        not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The content attribute may be quoted with either kind of quote
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract the stream advertised in the meta tags of a vine.co page.

        The page is always fetched from its canonical https URL, rebuilt
        from the id. Returns a single-element list holding the info dict;
        thumbnail and uploader are optional. Raises ExtractorError when the
        URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group keeps a query string out of the first one
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract a Flickr video through its two playlist XML documents.

        The photo page yields the secret, the first XML document the node
        id, and the second one the stream application and path. Returns a
        single-element list holding the info dict. Raises ExtractorError
        when the URL is invalid or no stream is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path is unescaped before being appended to the application URL
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a teamcoco.com video.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric id is only available inside the page markup
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The stream location comes from a separate per-video XML document
        cvp_xml = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        stream_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        info = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [info]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the media URL and metadata of an xhamster.com movie page.

        Returns a single-element list holding the info dict; upload date and
        thumbnail are optional, the uploader id defaults to u'anonymous'.
        Raises ExtractorError when the URL is invalid or the page lacks the
        server/file pair.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        if not server:
            # Without a server the file entry is itself a quoted URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = server+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenated as YYYYMMDD
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the mp3 URL of a hypem.com track page.

        The track page yields a track id and key plus a cookie; both are
        needed to ask the serve endpoint for the final URL. Returns a
        single-element list holding the info dict. Raises ExtractorError on
        an invalid URL or when either response is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie handed out here is sent back with the serve request
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty POST body; it has to be bytes, Python 3 rejects a text body
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract the media and thumbnail URLs of a vbox7.com video.

        Returns a single-element list holding the info dict. Raises
        ExtractorError when the URL is invalid or the info response does not
        consist of exactly two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects through javascript; follow it by hand
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode() output is pure ASCII; the POST body has to be bytes
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split on the first '=' only, so values containing '=' stay intact
        fields = [field.split('=', 1) for field in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [value for _, value in fields]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1960 def gen_extractors():
1961 """ Return a list of an instance of every supported extractor.
1962 The order does matter; the first extractor matched is the one handling the URL.
1965 YoutubePlaylistIE(),
1990 StanfordOpenClassroomIE(),
2000 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The class is the module-level global named ie_name followed by 'IE';
    a KeyError is raised when no such global exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]