Move G+ IE into its own file, and move google search into a more descriptive module
[youtube-dl] / youtube_dl / extractor / googleplus.py
1 import datetime
2 import re
3
4 from .common import InfoExtractor
5 from ..utils import (
6     ExtractorError,
7 )
8
9
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    Handles URLs of Google+ posts (see ``_VALID_URL``). The post page is
    downloaded first to read metadata and to find the photo/video page; the
    video page is then downloaded to find the actual media URLs.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video behind a Google+ post URL.

        Returns a one-element list holding a dict with the keys ``id``,
        ``url``, ``uploader``, ``upload_date``, ``title`` and ``ext``.
        ``uploader`` and ``upload_date`` are looked up non-fatally and may
        therefore be empty/None.

        Raises ExtractorError if the URL does not match ``_VALID_URL`` or if
        no video link is found on the video page. The inherited download and
        search helpers may raise as well (not visible from this file).
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract upload date
        upload_date = self._html_search_regex(r'title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            try:
                upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            except ValueError:
                # The date is optional metadata (it is searched with
                # fatal=False), so an unexpected timestamp format must not
                # abort the whole extraction.
                upload_date = None
            else:
                upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page.
        # Each match is a (resolution, url) tuple of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the highest resolution. The resolution must be compared as an
        # integer: compared as strings, "720" would rank above "1080". The
        # URL is kept as a secondary key so ties resolve deterministically.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only the URL is needed from here on
        video_url = best_link[1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]