Merge remote-tracking branch 'epitron/metadata-pp'

[youtube-dl] / youtube_dl / extractor / smotri.py
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py

index a589a893bea8b6c1f32f59d3c7d2b1edf24af740..99f5b19d2dd68e78aaf2f45f79fad60b8bb459dc 100644 (file)
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -1,5 +1,6 @@
  # encoding: utf-8
  
+import os.path
  import re
  import json
  import hashlib
@@ -10,6 +11,7 @@ from ..utils import (
      compat_urllib_parse,
      compat_urllib_request,
      ExtractorError,
+    url_basename,
  )
  
  
@@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):
          # We will extract some from the video web page instead
          video_page_url = 'http://' + mobj.group('url')
          video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
-        
+
+        # Warn if the video is unavailable
+        warning = self._html_search_regex(
+            r'<div class="videoUnModer">(.*?)</div>', video_page,
+            u'warning message', default=None)
+        if warning is not None:
+            self._downloader.report_warning(
+                u'Video %s may not be available; smotri said: %s ' %
+                (video_id, warning))
+
          # Adult content
          if re.search(u'EroConfirmText">', video_page) is not None:
              self.report_age_confirmation()
@@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):
          # Extract the rest of meta data
          video_title = self._search_meta(u'name', video_page, u'title')
          if not video_title:
-            video_title = video_url.rsplit('/', 1)[-1]
+            video_title = os.path.splitext(url_basename(video_url))[0]
  
          video_description = self._search_meta(u'description', video_page)
          END_TEXT = u' на сайте Smotri.com'
-        if video_description.endswith(END_TEXT):
+        if video_description and video_description.endswith(END_TEXT):
              video_description = video_description[:-len(END_TEXT)]
          START_TEXT = u'Смотреть онлайн ролик '
-        if video_description.startswith(START_TEXT):
+        if video_description and video_description.startswith(START_TEXT):
              video_description = video_description[len(START_TEXT):]
          video_thumbnail = self._search_meta(u'thumbnail', video_page)
  
          upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
-        upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
-        video_upload_date = (
-            (
-                upload_date_m.group('year') +
-                upload_date_m.group('month') +
-                upload_date_m.group('day')
+        if upload_date_str:
+            upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+            video_upload_date = (
+                (
+                    upload_date_m.group('year') +
+                    upload_date_m.group('month') +
+                    upload_date_m.group('day')
+                )
+                if upload_date_m else None
              )
-            if upload_date_m else None
-        )
+        else:
+            video_upload_date = None
          
          duration_str = self._search_meta(u'duration', video_page)
-        duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
-        video_duration = (
-            (
-                (int(duration_m.group('hours')) * 60 * 60) +
-                (int(duration_m.group('minutes')) * 60) +
-                int(duration_m.group('seconds'))
+        if duration_str:
+            duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+            video_duration = (
+                (
+                    (int(duration_m.group('hours')) * 60 * 60) +
+                    (int(duration_m.group('minutes')) * 60) +
+                    int(duration_m.group('seconds'))
+                )
+                if duration_m else None
              )
-            if duration_m else None
-        )
+        else:
+            video_duration = None
          
          video_uploader = self._html_search_regex(
              u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',