10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.escapist import EscapistIE
28 from .extractor.facebook import FacebookIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.googleplus import GooglePlusIE
32 from .extractor.googlesearch import GoogleSearchIE
33 from .extractor.metacafe import MetacafeIE
34 from .extractor.myvideo import MyVideoIE
35 from .extractor.statigram import StatigramIE
36 from .extractor.photobucket import PhotobucketIE
37 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
38 from .extractor.vimeo import VimeoIE
39 from .extractor.yahoo import YahooIE, YahooSearchIE
40 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
41 from .extractor.zdf import ZDFIE
71 class CollegeHumorIE(InfoExtractor):
72 """Information extractor for collegehumor.com"""
# NOTE(review): several short lines (guards, `try:`, dict/return lines) are not
# visible in this listing; the comments cover only the statements shown.
# Accepts .../video/<digits>/<short title>; the scheme and "www." are optional.
75 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
76 IE_NAME = u'collegehumor'
78 def report_manifest(self, video_id):
79 """Report information extraction."""
# Emits a progress message naming the video whose XML manifest is being fetched.
80 self.to_screen(u'%s: Downloading XML manifest' % video_id)
82 def _real_extract(self, url):
# Match the URL against _VALID_URL and take the numeric id from it.
83 mobj = re.match(self._VALID_URL, url)
85 raise ExtractorError(u'Invalid URL: %s' % url)
86 video_id = mobj.group('videoid')
94 self.report_extraction(video_id)
# Step 1: download the per-video metadata XML from the "moogaloop" endpoint.
95 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
97 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
98 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
99 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
101 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Read description, title (from <caption>), thumbnail and the manifest URL
# (from <file>) out of the first <video> element.
103 videoNode = mdoc.findall('./video')[0]
104 info['description'] = videoNode.findall('./description')[0].text
105 info['title'] = videoNode.findall('./caption')[0].text
106 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
107 manifest_url = videoNode.findall('./file')[0].text
109 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: download the manifest, with the hdcore query parameter appended.
111 manifest_url += '?hdcore=2.10.3'
112 self.report_manifest(video_id)
114 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
115 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): this message is the same text used for the metadata download
# above, although the request here is for the manifest.
116 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
118 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Element names are qualified with the Adobe f4m 1.0 namespace.
120 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
121 node_id = media_node.attrib['url']
# video_id is rebound here to the text of the manifest's <id> element.
122 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
123 except IndexError as err:
124 raise ExtractorError(u'Invalid manifest file')
# Final URL: manifest scheme and host, '/z', the manifest id without its last
# two characters, '/', the media node's url attribute, then 'Seg1-Frag1'.
126 url_pr = compat_urllib_parse_urlparse(manifest_url)
127 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
134 class XVideosIE(InfoExtractor):
135 """Information extractor for xvideos.com"""
# NOTE(review): some short lines (guards, dict/return lines, continuation
# lines) are not visible in this listing.
# The first capture group is the numeric video id.
137 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
140 def _real_extract(self, url):
141 mobj = re.match(self._VALID_URL, url)
143 raise ExtractorError(u'Invalid URL: %s' % url)
144 video_id = mobj.group(1)
146 webpage = self._download_webpage(url, video_id)
148 self.report_extraction(video_id)
# The media URL is the percent-decoded value of the page's flv_url= parameter.
151 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
152 webpage, u'video URL'))
# Title: the <title> text up to the " - XVID" suffix.
155 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
158 # Extract video thumbnail
# NOTE(review): the only capture group covers the trailing file name, not the
# whole URL - confirm which part of the match _search_regex returns.
159 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
160 webpage, u'thumbnail', fatal=False)
167 'title': video_title,
169 'thumbnail': video_thumbnail,
178 class InfoQIE(InfoExtractor):
179 """Information extractor for infoq.com"""
# NOTE(review): some short lines (guards, dict/return lines, continuation
# lines) are not visible in this listing.
# Matches exactly two path segments after the host.
180 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
182 def _real_extract(self, url):
183 mobj = re.match(self._VALID_URL, url)
185 raise ExtractorError(u'Invalid URL: %s' % url)
# The page URL itself is passed as the id for download and progress reporting.
187 webpage = self._download_webpage(url, video_id=url)
188 self.report_extraction(url)
# The stream path is stored in the page as a base64 value assigned to
# "jsclassref"; it is base64-decoded, then percent-decoded.
191 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
193 raise ExtractorError(u'Unable to extract video url')
194 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
195 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
198 video_title = self._search_regex(r'contentTitle = "(.*?)";',
201 # Extract description
202 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
203 webpage, u'description', fatal=False)
# The id and extension come from the last path component of the stream URL.
# NOTE(review): unpacking into two names requires that component to contain
# exactly one '.'; any other count raises ValueError.
205 video_filename = video_url.split('/')[-1]
206 video_id, extension = video_filename.split('.')
213 'title': video_title,
214 'ext': extension, # Extension is always(?) mp4, but seems to be flv
216 'description': video_description,
221 class MixcloudIE(InfoExtractor):
222 """Information extractor for www.mixcloud.com"""
# NOTE(review): many short lines (`try:`, loop headers, returns, dict braces)
# are not visible in this listing; comments cover only what is shown.
224 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Two capture groups: the two path segments following the host.
225 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
226 IE_NAME = u'mixcloud'
228 def report_download_json(self, file_id):
229 """Report JSON download."""
# The file_id argument is not used in the message.
230 self.to_screen(u'Downloading json')
232 def get_urls(self, jsonData, fmt, bitrate='best'):
233 """Get urls from 'audio_formats' section in json"""
# When no usable bitrate is requested (None, 'best', or one that is not
# offered), the maximum entry of the bitrate collection is chosen.
236 bitrate_list = jsonData[fmt]
237 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
238 bitrate = max(bitrate_list) # select highest
240 url_list = jsonData[fmt][bitrate]
241 except TypeError: # we have no bitrate info.
242 url_list = jsonData[fmt]
245 def check_urls(self, url_list):
246 """Returns 1st active url from list"""
# A URL is probed by opening it; network errors are caught by the handler below.
249 compat_urllib_request.urlopen(url)
251 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
256 def _print_formats(self, formats):
# Prints one tab-separated line per format/bitrate with the file extension
# taken from the text after the last '.' of the first URL.
257 print('Available formats:')
258 for fmt in formats.keys():
259 for b in formats[fmt]:
261 ext = formats[fmt][b][0]
262 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
263 except TypeError: # we have no bitrate info
264 ext = formats[fmt][0]
265 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
268 def _real_extract(self, url):
269 mobj = re.match(self._VALID_URL, url)
271 raise ExtractorError(u'Invalid URL: %s' % url)
272 # extract uploader & filename from url
# NOTE(review): `.decode('utf-8')` is called on regex groups here and on
# several values further down; on objects that are already text (Python 3
# `str`) this method does not exist - confirm the intended interpreter.
273 uploader = mobj.group(1).decode('utf-8')
274 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
276 # construct API request
# Uses the second- and third-from-last '/'-separated pieces of the URL.
277 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
278 # retrieve .json file with links to files
279 request = compat_urllib_request.Request(file_url)
281 self.report_download_json(file_url)
282 jsonData = compat_urllib_request.urlopen(request).read()
283 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
284 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
287 json_data = json.loads(jsonData)
288 player_url = json_data['player_swf_url']
289 formats = dict(json_data['audio_formats'])
291 req_format = self._downloader.params.get('format', None)
294 if self._downloader.params.get('listformats', None):
295 self._print_formats(formats)
# No explicit format requested: go through the offered formats and probe
# their URLs with check_urls.
298 if req_format is None or req_format == 'best':
299 for format_param in formats.keys():
300 url_list = self.get_urls(formats, format_param)
302 file_url = self.check_urls(url_list)
303 if file_url is not None:
# Explicit format requested: it must be one of the offered formats.
306 if req_format not in formats:
307 raise ExtractorError(u'Format is not available')
309 url_list = self.get_urls(formats, req_format)
310 file_url = self.check_urls(url_list)
311 format_param = req_format
# NOTE(review): `uploader` and `file_id` were already passed through
# `.decode('utf-8')` above and are decoded a second time here.
314 'id': file_id.decode('utf-8'),
315 'url': file_url.decode('utf-8'),
316 'uploader': uploader.decode('utf-8'),
318 'title': json_data['name'],
319 'ext': file_url.split('.')[-1].decode('utf-8'),
320 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
321 'thumbnail': json_data['thumbnail_url'],
322 'description': json_data['description'],
323 'player_url': player_url.decode('utf-8'),
326 class StanfordOpenClassroomIE(InfoExtractor):
327 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): many short lines (guards, `try:`, dict braces, loop headers,
# returns) are not visible in this listing.
# Optional named groups: 'course' and 'video' from the query string.
329 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
330 IE_NAME = u'stanfordoc'
332 def _real_extract(self, url):
333 mobj = re.match(self._VALID_URL, url)
335 raise ExtractorError(u'Invalid URL: %s' % url)
# Three cases, decided by which groups matched: a single video, a course
# page, or (neither group) the site's home page.
337 if mobj.group('course') and mobj.group('video'): # A specific video
338 course = mobj.group('course')
339 video = mobj.group('video')
341 'id': course + '_' + video,
346 self.report_extraction(info['id'])
# Per-video metadata lives in <course>/videos/<video>.xml.
347 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
348 xmlUrl = baseUrl + video + '.xml'
350 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
352 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
353 mdoc = xml.etree.ElementTree.fromstring(metaXml)
355 info['title'] = mdoc.findall('./title')[0].text
356 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
358 raise ExtractorError(u'Invalid metadata XML file')
# Extension: the text after the last '.' in the media URL.
359 info['ext'] = info['url'].rpartition('.')[2]
361 elif mobj.group('course'): # A course page
362 course = mobj.group('course')
370 coursepage = self._download_webpage(url, info['id'],
371 note='Downloading course info page',
372 errnote='Unable to download course info page')
374 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
376 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
377 coursepage, u'description', fatal=False)
# Collect the VideoPage links found on the course page.
379 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
383 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each referenced page is extracted in turn and the results are concatenated.
# NOTE(review): `assert` is skipped when Python runs with -O.
387 for entry in info['list']:
388 assert entry['type'] == 'reference'
389 results += self.extract(entry['url'])
393 'id': 'Stanford OpenClassroom',
399 self.report_download_webpage(info['id'])
400 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
402 rootpage = compat_urllib_request.urlopen(rootURL).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
406 info['title'] = info['id']
# Collect the CoursePage links found on the home page.
408 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
412 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
417 for entry in info['list']:
418 assert entry['type'] == 'reference'
419 results += self.extract(entry['url'])
422 class MTVIE(InfoExtractor):
423 """Information extractor for MTV.com"""
# NOTE(review): some short lines (guards, `try:`, dict/return lines,
# continuation lines) are not visible in this listing.
425 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
428 def _real_extract(self, url):
429 mobj = re.match(self._VALID_URL, url)
431 raise ExtractorError(u'Invalid URL: %s' % url)
# A URL given without a scheme gets 'http://' prepended before downloading.
432 if not mobj.group('proto'):
433 url = 'http://' + url
434 video_id = mobj.group('videoid')
436 webpage = self._download_webpage(url, video_id)
# Metadata is read from <meta> tags in the page.
438 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
439 webpage, u'song name', fatal=False)
441 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
444 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
445 webpage, u'mtvn_uri', fatal=False)
447 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
448 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri and content_id are looked up with fatal=False and then
# concatenated into this URL - confirm what the helpers return on a miss, since
# a non-string value would break the concatenation.
450 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
451 self.report_extraction(video_id)
452 request = compat_urllib_request.Request(videogen_url)
454 metadataXml = compat_urllib_request.urlopen(request).read()
455 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
456 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
458 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
459 renditions = mdoc.findall('.//rendition')
461 # For now, always pick the highest quality.
# The last <rendition> element in document order is the one used.
462 rendition = renditions[-1]
# ext is the part of the 'type' attribute after '/'; the format label combines
# ext, width x height and bitrate.
465 _,_,ext = rendition.attrib['type'].partition('/')
466 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
467 video_url = rendition.find('./src').text
469 raise ExtractorError('Invalid rendition field.')
# NOTE(review): no assignment to `performer` is visible in this listing -
# verify that it is defined before this point.
474 'uploader': performer,
476 'title': video_title,
484 class YoukuIE(InfoExtractor):
# NOTE(review): the `def` line of the method whose body follows is not visible
# in this listing (the caller further down refers to it as `_gen_sid`).
# Named group 'ID' is the alphanumeric video id in the page URL.
485 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Builds a decimal string: current time in milliseconds followed by two
# random integers (1000-1998 and 1000-9999).
488 nowTime = int(time.time() * 1000)
489 random1 = random.randint(1000,1998)
490 random2 = random.randint(1000,9999)
492 return "%d%d%d" %(nowTime,random1,random2)
494 def _get_file_ID_mix_string(self, seed):
496 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
498 for i in range(len(source)):
499 seed = (seed * 211 + 30031 ) % 65536
500 index = math.floor(seed / 65536 * len(source) )
501 mixed.append(source[int(index)])
502 source.remove(source[int(index)])
503 #return ''.join(mixed)
506 def _get_file_id(self, fileId, seed):
# NOTE(review): several short lines (list setup, loop headers, guards,
# returns) are not visible in this listing.
# fileId is a '*'-separated list of numbers; each number is used as an index
# into the seed-shuffled alphabet and the selected characters are joined.
507 mixed = self._get_file_ID_mix_string(seed)
508 ids = fileId.split('*')
512 realId.append(mixed[int(ch)])
513 return ''.join(realId)
515 def _real_extract(self, url):
516 mobj = re.match(self._VALID_URL, url)
518 raise ExtractorError(u'Invalid URL: %s' % url)
519 video_id = mobj.group('ID')
# The play-list endpoint returns JSON describing the video.
521 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
523 jsondata = self._download_webpage(info_url, video_id)
525 self.report_extraction(video_id)
527 config = json.loads(jsondata)
# Everything used below comes from the first entry of config['data'].
529 video_title = config['data'][0]['title']
530 seed = config['data'][0]['seed']
# Pick the stream format from the user's 'format' option and the formats
# listed under 'streamfileids'.
532 format = self._downloader.params.get('format', None)
533 supported_format = list(config['data'][0]['streamfileids'].keys())
535 if format is None or format == 'best':
536 if 'hd2' in supported_format:
541 elif format == 'worst':
# Encoded file id and the per-segment keys for the chosen format.
549 fileid = config['data'][0]['streamfileids'][format]
550 keys = [s['k'] for s in config['data'][0]['segs'][format]]
551 except (UnicodeDecodeError, ValueError, KeyError):
552 raise ExtractorError(u'Unable to extract info section')
555 sid = self._gen_sid()
556 fileid = self._get_file_id(fileid, seed)
558 #column 8,9 of fileid represent the segment number
559 #fileid[7:9] should be changed
# One download URL per segment key. The code keeps fileid[0:8], inserts the
# segment index as two uppercase hex digits, then appends fileid[10:].
# NOTE(review): that replaces the characters at positions 8-9 (0-based), which
# differs from the "fileid[7:9]" wording in the comment above.
560 for index, key in enumerate(keys):
562 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
563 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Each segment gets its own id of the form <video_id>_partNN (decimal index).
566 'id': '%s_part%02d' % (video_id, index),
570 'title': video_title,
573 files_info.append(info)
578 class XNXXIE(InfoExtractor):
579 """Information extractor for xnxx.com"""
# NOTE(review): some short lines (guards, dict/return lines, continuation
# lines) are not visible in this listing.
# The first capture group is the numeric video id.
581 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Patterns applied to the downloaded page in _real_extract.
583 VIDEO_URL_RE = r'flv_url=(.*?)&'
584 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
585 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
587 def _real_extract(self, url):
588 mobj = re.match(self._VALID_URL, url)
590 raise ExtractorError(u'Invalid URL: %s' % url)
591 video_id = mobj.group(1)
593 # Get webpage content
594 webpage = self._download_webpage(url, video_id)
# The media URL is percent-encoded in the page, so it is unquoted after the match.
596 video_url = self._search_regex(self.VIDEO_URL_RE,
597 webpage, u'video URL')
598 video_url = compat_urllib_parse.unquote(video_url)
600 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# The thumbnail lookup is non-fatal.
603 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
604 webpage, u'thumbnail', fatal=False)
611 'title': video_title,
613 'thumbnail': video_thumbnail,
619 class NBAIE(InfoExtractor):
# NOTE(review): some short lines (guards, dict/return lines) are not visible
# in this listing.
# Group 1 is the path after "video" (leading '/' included), without any
# trailing "/index.html" or query string.
620 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
623 def _real_extract(self, url):
624 mobj = re.match(self._VALID_URL, url)
626 raise ExtractorError(u'Invalid URL: %s' % url)
628 video_id = mobj.group(1)
630 webpage = self._download_webpage(url, video_id)
# The media URL is built directly from the path with a fixed 1280x720 suffix.
632 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# The last path segment serves as the reported id and as the default title.
634 shortened_video_id = video_id.rpartition('/')[2]
635 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
636 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
638 # It isn't there in the HTML it returns to us
639 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
641 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
644 'id': shortened_video_id,
648 # 'uploader_date': uploader_date,
649 'description': description,
653 class JustinTVIE(InfoExtractor):
654 """Information extractor for justin.tv and twitch.tv"""
655 # TODO: One broadcast may be split into multiple videos. The key
656 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
657 # starts at 1 and increases. Can we treat all parts as one video?
659 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
661 (?P<channelid>[^/]+)|
662 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
663 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
667 _JUSTIN_PAGE_LIMIT = 100
668 IE_NAME = u'justin.tv'
# NOTE(review): many short lines (guards, list setup, loop headers, `break`,
# returns, and the closing lines of the pattern above) are not visible in this
# listing; comments cover only what is shown.
# The pattern above has three alternatives: channel, broadcast ("/b/") and
# chapter ("/c/"), captured as 'channelid', 'videoid' and 'chapterid'.
670 def report_download_page(self, channel, offset):
671 """Report attempt to download a single page of videos."""
672 self.to_screen(u'%s: Downloading video information from %d to %d' %
673 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
675 # Return count of items, list of *valid* items
676 def _parse_page(self, url, video_id):
677 webpage = self._download_webpage(url, video_id,
678 u'Downloading video info JSON',
679 u'unable to download video info JSON')
# A response that is not a list is treated as an error reply whose 'error'
# key carries the message.
# NOTE(review): this is an exact-type comparison; isinstance(response, list)
# is the more usual check.
681 response = json.loads(webpage)
682 if type(response) != list:
683 error_text = response.get('error', 'unknown error')
684 raise ExtractorError(u'Justin.tv API: %s' % error_text)
686 for clip in response:
687 video_url = clip['video_file_url']
# Extension without the leading dot; upload date is the first 10 characters of
# start_time with the '-' separators removed.
689 video_extension = os.path.splitext(video_url)[1][1:]
690 video_date = re.sub('-', '', clip['start_time'][:10])
691 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
# The video_id parameter is rebound to the clip's own id here.
692 video_id = clip['id']
693 video_title = clip.get('title', video_id)
697 'title': video_title,
698 'uploader': clip.get('channel_name', video_uploader_id),
699 'uploader_id': video_uploader_id,
700 'upload_date': video_date,
701 'ext': video_extension,
# The first element counts every item in the response.
703 return (len(response), info)
705 def _real_extract(self, url):
706 mobj = re.match(self._VALID_URL, url)
708 raise ExtractorError(u'invalid URL: %s' % url)
710 api_base = 'http://api.justin.tv'
# Channel URL: use the channel archives endpoint.
712 if mobj.group('channelid'):
714 video_id = mobj.group('channelid')
715 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: the archive id is read from the page, then matched against the
# <archive> entries of the chapter XML.
716 elif mobj.group('chapterid'):
717 chapter_id = mobj.group('chapterid')
719 webpage = self._download_webpage(url, chapter_id)
720 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
722 raise ExtractorError(u'Cannot find archive of a chapter')
723 archive_id = m.group(1)
725 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
726 chapter_info_xml = self._download_webpage(api, chapter_id,
727 note=u'Downloading chapter information',
728 errnote=u'Chapter information download failed')
729 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
730 for a in doc.findall('.//archive'):
731 if archive_id == a.find('./id').text:
734 raise ExtractorError(u'Could not find chapter in chapter information')
# `a` is the loop variable from the search above.
736 video_url = a.find('./video_file_url').text
737 video_ext = video_url.rpartition('.')[2] or u'flv'
# Title, thumbnail, description and uploader come from the Twitch "kraken"
# videos endpoint, addressed as 'c' + chapter id.
739 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
740 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
741 note='Downloading chapter metadata',
742 errnote='Download of chapter metadata failed')
743 chapter_info = json.loads(chapter_info_json)
745 bracket_start = int(doc.find('.//bracket_start').text)
746 bracket_end = int(doc.find('.//bracket_end').text)
748 # TODO determine start (and probably fix up file)
749 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
750 #video_url += u'?start=' + TODO:start_timestamp
751 # bracket_start is 13290, but we want 51670615
# The bracket values are only reported in this warning; they are not applied
# to video_url.
752 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
753 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
756 'id': u'c' + chapter_id,
759 'title': chapter_info['title'],
760 'thumbnail': chapter_info['preview'],
761 'description': chapter_info['description'],
762 'uploader': chapter_info['channel']['display_name'],
763 'uploader_id': chapter_info['channel']['name'],
# Broadcast URL: use the by_archive endpoint.
767 video_id = mobj.group('videoid')
768 api = api_base + '/broadcast/by_archive/%s.json' % video_id
770 self.report_extraction(video_id)
# Results are fetched page by page with offset/limit; the stop test below
# ends the paging when `paged` is false or a page returns a count different
# from `limit`.
774 limit = self._JUSTIN_PAGE_LIMIT
777 self.report_download_page(video_id, offset)
778 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
779 page_count, page_info = self._parse_page(page_url, video_id)
780 info.extend(page_info)
781 if not paged or page_count != limit:
786 class FunnyOrDieIE(InfoExtractor):
# NOTE(review): some short lines (guards, dict/return lines) are not visible
# in this listing.
# Named group 'id' is the hexadecimal video id.
787 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
789 def _real_extract(self, url):
790 mobj = re.match(self._VALID_URL, url)
792 raise ExtractorError(u'invalid URL: %s' % url)
794 video_id = mobj.group('id')
795 webpage = self._download_webpage(url, video_id)
# The pattern skips the first <source> tag inside <video> and captures the
# src attribute of the second one.
797 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
798 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are supplied: the player page <h1>, then the <title> tag.
800 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
801 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
# The description lookup is non-fatal.
803 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
804 webpage, u'description', fatal=False, flags=re.DOTALL)
811 'description': video_description,
815 class SteamIE(InfoExtractor):
816 _VALID_URL = r"""http://store\.steampowered\.com/
818 (?P<urltype>video|app)/ #If the page is only for videos or for a game
820 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
822 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# NOTE(review): several short lines (parts of the verbose pattern above, a
# decorator, list setup, dict braces) are not visible in this listing.
# The age-check URL carries a fixed birth date of 1 January 1970.
823 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
826 def suitable(cls, url):
827 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is matched with re.VERBOSE passed explicitly.
828 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
830 def _real_extract(self, url):
# NOTE(review): there is no guard between these two lines - if the URL does
# not match, `m` is None and `.group` raises AttributeError.
831 m = re.match(self._VALID_URL, url, re.VERBOSE)
832 gameID = m.group('gameID')
834 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
835 webpage = self._download_webpage(videourl, gameID)
# If the birth-date prompt is shown, download again through the age-check URL.
837 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
838 videourl = self._AGECHECK_TEMPLATE % gameID
839 self.report_age_confirmation()
840 webpage = self._download_webpage(videourl, gameID)
842 self.report_extraction(gameID)
843 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
844 webpage, 'game title')
# Three independent scans of the page: movie entries (id + file URL), title
# spans, and thumbnail images.
846 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
847 mweb = re.finditer(urlRE, webpage)
848 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
849 titles = re.finditer(namesRE, webpage)
850 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
851 thumbs = re.finditer(thumbsRE, webpage)
# The three match sequences are paired by position; zip stops at the shortest.
853 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
854 video_id = vid.group('videoID')
855 title = vtitle.group('videoName')
856 video_url = vid.group('videoURL')
857 video_thumb = thumb.group('thumbnail')
859 raise ExtractorError(u'Cannot find video url for %s' % video_id)
864 'title': unescapeHTML(title),
865 'thumbnail': video_thumb
# The videos are wrapped in one playlist result identified by the game id and title.
868 return [self.playlist_result(videos, gameID, game_title)]
870 class UstreamIE(InfoExtractor):
# NOTE(review): some short lines (dict/return lines, continuation lines) are
# not visible in this listing.
# Named group 'videoID' is the numeric id of a recorded video.
871 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
874 def _real_extract(self, url):
# NOTE(review): there is no guard between these two lines - if the URL does
# not match, `m` is None and `.group` raises AttributeError.
875 m = re.match(self._VALID_URL, url)
876 video_id = m.group('videoID')
# The media URL is derived from the id alone; the page is downloaded for the
# title, uploader and thumbnail.
878 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
879 webpage = self._download_webpage(url, video_id)
881 self.report_extraction(video_id)
883 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# The uploader and thumbnail lookups are non-fatal.
886 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
887 webpage, u'uploader', fatal=False, flags=re.DOTALL)
889 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
890 webpage, u'thumbnail', fatal=False)
896 'title': video_title,
897 'uploader': uploader,
898 'thumbnail': thumbnail,
902 class WorldStarHipHopIE(InfoExtractor):
# NOTE(review): some short lines (branches, guards, dict/return lines) are not
# visible in this listing.
# Accepts the www/m hosts of worldstarhiphop.com and worldstarcandy.com; 'id'
# is everything after "v=".
903 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
904 IE_NAME = u'WorldStarHipHop'
906 def _real_extract(self, url):
# NOTE(review): there is no guard between these two lines - if the URL does
# not match, `m` is None and `.group` raises AttributeError.
907 m = re.match(self._VALID_URL, url)
908 video_id = m.group('id')
910 webpage_src = self._download_webpage(url, video_id)
# The media URL is the "file" variable passed to the page's player setup.
912 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
913 webpage_src, u'video URL')
# The branch taken depends on whether 'mp4' appears anywhere in the URL.
915 if 'mp4' in video_url:
920 video_title = self._html_search_regex(r"<title>(.*)</title>",
921 webpage_src, u'title')
923 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
924 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
925 webpage_src, u'thumbnail', fatal=False)
# Alternative title source: the text inside a "candytitles" span.
928 _title = r"""candytitles.*>(.*)</span>"""
929 mobj = re.search(_title, webpage_src)
931 video_title = mobj.group(1)
936 'title' : video_title,
937 'thumbnail' : thumbnail,
942 class RBMARadioIE(InfoExtractor):
# NOTE(review): some short lines (`try:`, dict/return lines) are not visible
# in this listing.
# Named group 'videoID' is the show slug, the last path segment.
943 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
945 def _real_extract(self, url):
# NOTE(review): there is no guard between these two lines - if the URL does
# not match, `m` is None and `.group` raises AttributeError.
946 m = re.match(self._VALID_URL, url)
947 video_id = m.group('videoID')
949 webpage = self._download_webpage(url, video_id)
# The show metadata is a JSON value assigned to "gon.show" in the page; the
# match runs up to a ';' at the end of a line (MULTILINE).
951 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
952 webpage, u'json data', flags=re.MULTILINE)
955 data = json.loads(json_data)
956 except ValueError as e:
957 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' is appended to the stream URL; the extension is the text after
# the last '.' in the URL path.
959 video_url = data['akamai_url'] + '&cbr=256'
960 url_parts = compat_urllib_parse_urlparse(video_url)
961 video_ext = url_parts.path.rpartition('.')[2]
# 'title' is required; the remaining fields use .get(), with {} as the default
# for the nested 'host' and 'image' lookups.
966 'title': data['title'],
967 'description': data.get('teaser_text'),
968 'location': data.get('country_of_origin'),
969 'uploader': data.get('host', {}).get('name'),
970 'uploader_id': data.get('host', {}).get('slug'),
971 'thumbnail': data.get('image', {}).get('large_url_2x'),
972 'duration': data.get('duration'),
977 class YouPornIE(InfoExtractor):
978 """Information extractor for youporn.com."""
# NOTE(review): many short lines (`try:`, `except` headers, loop headers, dict
# braces, returns) are not visible in this listing.
# Named groups: 'videoid' (digits) and 'title' (the following path segment).
979 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
981 def _print_formats(self, formats):
982 """Print all available formats"""
983 print(u'Available formats:')
984 print(u'ext\t\tformat')
985 print(u'---------------------------------')
986 for format in formats:
987 print(u'%s\t\t%s' % (format['ext'], format['format']))
989 def _specific(self, req_format, formats):
# Compares each entry's "format" value with the requested format.
991 if(x["format"]==req_format):
995 def _real_extract(self, url):
996 mobj = re.match(self._VALID_URL, url)
998 raise ExtractorError(u'Invalid URL: %s' % url)
999 video_id = mobj.group('videoid')
# The page is requested with an age_verified=1 cookie.
1001 req = compat_urllib_request.Request(url)
1002 req.add_header('Cookie', 'age_verified=1')
1003 webpage = self._download_webpage(req, video_id)
1005 # Get JSON parameters
1006 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1008 params = json.loads(json_params)
1010 raise ExtractorError(u'Invalid JSON')
1012 self.report_extraction(video_id)
1014 video_title = params['title']
1015 upload_date = unified_strdate(params['release_date_f'])
1016 video_description = params['description']
1017 video_uploader = params['submitted_by']
1018 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception object, so concatenating it
# to a str would itself raise TypeError; a string conversion looks intended.
1020 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1022 # Get all of the formats available
1023 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1024 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1025 webpage, u'download list').strip()
1027 # Get all of the links from the page
1028 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1029 links = re.findall(LINK_RE, download_list_html)
1030 if(len(links) == 0):
1031 raise ExtractorError(u'ERROR: no known formats available for video')
1033 self.to_screen(u'Links found: %d' % len(links))
1038 # A link looks like this:
1039 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1040 # A path looks like this:
1041 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1042 video_url = unescapeHTML( link )
1043 path = compat_urllib_parse_urlparse( video_url ).path
1044 extension = os.path.splitext( path )[1][1:]
# For the sample path above, element 4 of the '/'-split is
# "480p_370k_8004515"; its first two '_'-separated parts, joined with '-',
# give the format label "480p-370k".
1045 format = path.split('/')[4].split('_')[:2]
1048 format = "-".join( format )
1049 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1054 'uploader': video_uploader,
1055 'upload_date': upload_date,
1056 'title': video_title,
1059 'thumbnail': thumbnail,
1060 'description': video_description
1063 if self._downloader.params.get('listformats', None):
1064 self._print_formats(formats)
# Choose what to return based on the user's 'format' option.
1067 req_format = self._downloader.params.get('format', None)
1068 self.to_screen(u'Format: %s' % req_format)
1070 if req_format is None or req_format == 'best':
# 'worst' returns the last entry of the formats list.
1072 elif req_format == 'worst':
1073 return [formats[-1]]
1074 elif req_format in ('-1', 'all'):
1077 format = self._specific( req_format, formats )
1079 raise ExtractorError(u'Requested format not available')
1084 class PornotubeIE(InfoExtractor):
1085 """Information extractor for pornotube.com."""
# NOTE(review): some short lines (guards, dict entries, returns) are not
# visible in this listing.
# Optional 'channel' group; required 'videoid' (digits) and 'title' groups.
1086 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1088 def _real_extract(self, url):
1089 mobj = re.match(self._VALID_URL, url)
1091 raise ExtractorError(u'Invalid URL: %s' % url)
# Both the id and the title are taken from the URL itself.
1093 video_id = mobj.group('videoid')
1094 video_title = mobj.group('title')
1096 # Get webpage content
1097 webpage = self._download_webpage(url, video_id)
# The media URL is found in the page as a .flv URL and then percent-decoded.
1100 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1101 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1102 video_url = compat_urllib_parse.unquote(video_url)
1104 #Get the uploaded date
# Non-fatal lookup; the value is normalised with unified_strdate only when one
# was found.
1105 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1106 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1107 if upload_date: upload_date = unified_strdate(upload_date)
1109 info = {'id': video_id,
1112 'upload_date': upload_date,
1113 'title': video_title,
1119 class YouJizzIE(InfoExtractor):
1120 """Information extractor for youjizz.com."""
# NOTE(review): some short lines (guards, dict entries, returns) are not
# visible in this listing.
# Named group 'videoid' is the part of the page name up to its first '.'.
1121 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1123 def _real_extract(self, url):
1124 mobj = re.match(self._VALID_URL, url)
1126 raise ExtractorError(u'Invalid URL: %s' % url)
1128 video_id = mobj.group('videoid')
1130 # Get webpage content
1131 webpage = self._download_webpage(url, video_id)
1133 # Get the video title
1134 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1135 webpage, u'title').strip()
1137 # Get the embed page
1138 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1140 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the numeric id from the embed URL, and `webpage` is
# replaced by the embed page's content.
1142 embed_page_url = result.group(0).strip()
1143 video_id = result.group('videoid')
1145 webpage = self._download_webpage(embed_page_url, video_id)
# The media URL is the "file" variable passed to the embed page's player setup.
1148 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1149 webpage, u'video URL')
1151 info = {'id': video_id,
1153 'title': video_title,
1156 'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict per track of the mix at url.

        Tracks are fetched one at a time from the play/next JSON endpoints
        until the response flags the last track.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random number identifies this playback session in every request
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        for i in itertools.count():
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # The next request must name the track that was just served
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the keek at url.

        The media and thumbnail URLs are built directly from the video id;
        the page is only downloaded for the title and uploader.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose-mode pattern: it must always be matched with re.VERBOSE
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list: a talk info dict or a playlist result.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Only the name matches supply data; zipping with the <li> matches
        # caps the number of entries at the number of talk items found.
        playlist_entries = []
        for _m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>',
            webpage, 'title')
        json_data = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of htmlStreams is the one that gets downloaded
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        The video id is read from the URL path and used to download an XML
        metadata document, from which all other fields are taken.

        Raises ExtractorError if the metadata has no download url or title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional fields: fall back to the extension / None when absent
        format_id_el = metadata.find('format_id')
        video_format = extension if format_id_el is None else format_id_el.text

        description_el = metadata.find('description')
        description = None if description_el is None else description_el.text

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = None if imagePreview_el is None else imagePreview_el.text

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element of the document is the variant that is used
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http and https are accepted
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'file: "(.*?)",', webpage, u'video URL')

        # Strip the site prefix from the Open Graph title
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the post at url.

        Raises ExtractorError if url does not match _VALID_URL or no video
        source is found in the page.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, even for /video/ URLs
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes from the URL, so escape it before embedding
        # it in a pattern. The page stores quotes as the literal text \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the track at url.

        Only tracks that expose a free download page are supported.

        Raises ExtractorError if url does not match _VALID_URL, if the track
        has no free download page, or if any of the expected fields cannot
        be found in the downloaded pages.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(
            download_link, track_id, 'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist']
        }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for the video at url.

        All data is read from the player's MRSS document for the video id,
        not from the page at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone, dropping anything after it
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the vine at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group keeps any query string out of the result
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        The stream URL takes three requests: the photo page yields a secret,
        the first XML document yields a node id, and the second XML document
        (the playlist) yields the stream location.

        Raises ExtractorError if url does not match _VALID_URL or the stream
        location is missing from the playlist.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH attribute is HTML-unescaped before joining
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        The numeric video id is read from the page and then used to download
        an XML document that holds the media URL.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        # Until the real id is known, the URL title labels the download
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL or the media
        location cannot be found in the page. A missing upload date only
        produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(media.group('server')) == 0:
            # No server given: 'file' is itself a percent-encoded URL
            video_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            video_url = media.group('server')+'/key='+media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI

        date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date:
            # Concatenate into YYYYMMDD
            video_upload_date = date.group('upload_date_Y')+date.group('upload_date_m')+date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the track at url.

        The track page yields a JSON track list and a cookie; both are needed
        for the second request, which returns the final media URL.

        Raises ExtractorError if url does not match _VALID_URL or either
        JSON document cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set here is sent back with the metadata request below
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body makes this a POST; it must be bytes, because
        # Python 3's urllib rejects str request data.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dict for the video at url.

        The first page only contains a JavaScript redirect, which is followed
        manually. The media and thumbnail URLs come from a separate POST
        request.

        Raises ExtractorError if url does not match _VALID_URL or the info
        request returns nothing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The new location is appended to the URL that was actually fetched
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Keep only the part of the page title before the first slash
        title = self._html_search_regex(
            r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is read as two key=value pairs joined by '&'. Split on
        # the first '=' only, so a value containing '=' is kept whole.
        final_url, thumbnail_url = [
            pair.split('=', 1)[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1859 def gen_extractors():
1860 """ Return a list of an instance of every supported extractor.
1861 The order does matter; the first extractor matched is the one handling the URL.
1864 YoutubePlaylistIE(),
1889 StanfordOpenClassroomIE(),
1899 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name.

    ie_name is the class name without its 'IE' suffix (for example
    'Youtube' for YoutubeIE). The class is looked up among this module's
    globals, so an unknown name raises KeyError.
    """
    return globals()[ie_name+'IE']