Merge pull request #922 from JohnyMoSwag/master
[youtube-dl] / youtube_dl / extractor / googleplus.py
1 import datetime
2 import re
3
4 from .common import InfoExtractor
5 from ..utils import (
6     ExtractorError,
7 )
8
9
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video attached to a Google+ post.

        The post page is downloaded first to collect metadata (upload date,
        uploader, title) and the link to the photo/video page; that second
        page is then downloaded and scanned for the direct video URLs.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'uploader', 'upload_date', 'title' and 'ext'.  'uploader'
        and 'upload_date' may be None because they are looked up on a
        best-effort basis (fatal=False).

        Raises ExtractorError if the URL does not match _VALID_URL or if no
        video link can be found on the video page.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # group(0) is only the part of the URL matched by _VALID_URL, so
        # anything trailing the post id is dropped before downloading.
        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract upload date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename.  The
            # date is optional metadata, so text that is not in the
            # expected YYYY-MM-DD form must not abort the extraction.
            try:
                parsed_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
                upload_date = parsed_date.strftime('%Y%m%d')
            except ValueError:
                upload_date = None

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        DOMAIN = 'https://plus.google.com'
        video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
            webpage, u'video page URL')
        # The link may be site-relative; make it absolute before fetching.
        if not video_page.startswith(DOMAIN):
            video_page = DOMAIN + video_page

        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page.  Each match
        # is a (resolution, url) tuple of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            raise ExtractorError(u'Unable to extract video links')

        # Choose the highest resolution.  The resolution is captured as a
        # string, so compare it numerically: a plain string sort would rank
        # "720" above "1080".
        best_link = max(links, key=lambda link: int(link[0]))
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = best_link[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3: str has no decode()
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]