Merge remote-tracking branch 'epitron/metadata-pp'
[youtube-dl] / youtube_dl / extractor / smotri.py
index a589a893bea8b6c1f32f59d3c7d2b1edf24af740..99f5b19d2dd68e78aaf2f45f79fad60b8bb459dc 100644 (file)
@@ -1,5 +1,6 @@
 # encoding: utf-8
 
+import os.path
 import re
 import json
 import hashlib
@@ -10,6 +11,7 @@ from ..utils import (
     compat_urllib_parse,
     compat_urllib_request,
     ExtractorError,
+    url_basename,
 )
 
 
@@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):
         # We will extract some from the video web page instead
         video_page_url = 'http://' + mobj.group('url')
         video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
-        
+
+        # Warning if video is unavailable
+        warning = self._html_search_regex(
+            r'<div class="videoUnModer">(.*?)</div>', video_page,
+            u'warning message', default=None)
+        if warning is not None:
+            self._downloader.report_warning(
+                u'Video %s may not be available; smotri said: %s ' %
+                (video_id, warning))
+
         # Adult content
         if re.search(u'EroConfirmText">', video_page) is not None:
             self.report_age_confirmation()
@@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):
         # Extract the rest of meta data
         video_title = self._search_meta(u'name', video_page, u'title')
         if not video_title:
-            video_title = video_url.rsplit('/', 1)[-1]
+            video_title = os.path.splitext(url_basename(video_url))[0]
 
         video_description = self._search_meta(u'description', video_page)
         END_TEXT = u' на сайте Smotri.com'
-        if video_description.endswith(END_TEXT):
+        if video_description and video_description.endswith(END_TEXT):
             video_description = video_description[:-len(END_TEXT)]
         START_TEXT = u'Смотреть онлайн ролик '
-        if video_description.startswith(START_TEXT):
+        if video_description and video_description.startswith(START_TEXT):
             video_description = video_description[len(START_TEXT):]
         video_thumbnail = self._search_meta(u'thumbnail', video_page)
 
         upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
-        upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
-        video_upload_date = (
-            (
-                upload_date_m.group('year') +
-                upload_date_m.group('month') +
-                upload_date_m.group('day')
+        if upload_date_str:
+            upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+            video_upload_date = (
+                (
+                    upload_date_m.group('year') +
+                    upload_date_m.group('month') +
+                    upload_date_m.group('day')
+                )
+                if upload_date_m else None
             )
-            if upload_date_m else None
-        )
+        else:
+            video_upload_date = None
         
         duration_str = self._search_meta(u'duration', video_page)
-        duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
-        video_duration = (
-            (
-                (int(duration_m.group('hours')) * 60 * 60) +
-                (int(duration_m.group('minutes')) * 60) +
-                int(duration_m.group('seconds'))
+        if duration_str:
+            duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+            video_duration = (
+                (
+                    (int(duration_m.group('hours')) * 60 * 60) +
+                    (int(duration_m.group('minutes')) * 60) +
+                    int(duration_m.group('seconds'))
+                )
+                if duration_m else None
             )
-            if duration_m else None
-        )
+        else:
+            video_duration = None
         
         video_uploader = self._html_search_regex(
             u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',