[youtube] Improve tags extraction and add test
This commit is contained in:
		| @@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                 'upload_date': '20121002', |                 'upload_date': '20121002', | ||||||
|                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', |                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', | ||||||
|                 'categories': ['Science & Technology'], |                 'categories': ['Science & Technology'], | ||||||
|  |                 'tags': ['youtube-dl'], | ||||||
|                 'like_count': int, |                 'like_count': int, | ||||||
|                 'dislike_count': int, |                 'dislike_count': int, | ||||||
|                 'start_time': 1, |                 'start_time': 1, | ||||||
| @@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                 'ext': 'mp4', |                 'ext': 'mp4', | ||||||
|                 'upload_date': '20120506', |                 'upload_date': '20120506', | ||||||
|                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', |                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', | ||||||
|                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', |                 'description': 'md5:782e8651347686cba06e58f71ab51773', | ||||||
|  |                 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', | ||||||
|  |                          'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', | ||||||
|  |                          'iconic ep', 'iconic', 'love', 'it'], | ||||||
|                 'uploader': 'Icona Pop', |                 'uploader': 'Icona Pop', | ||||||
|                 'uploader_id': 'IconaPop', |                 'uploader_id': 'IconaPop', | ||||||
|             } |             } | ||||||
| @@ -1072,8 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|         else: |         else: | ||||||
|             video_categories = None |             video_categories = None | ||||||
|  |  | ||||||
|         video_tags = re.findall(r'''<meta(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?\s+property=['"]?og:video:tag['"]?(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?\s+content=['"]?([^>'"]+?)['"]?\s*>''' |         video_tags = [ | ||||||
|         , video_webpage, re.DOTALL | re.IGNORECASE); |             unescapeHTML(m.group('content')) | ||||||
|  |             for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] | ||||||
|  |  | ||||||
|         # description |         # description | ||||||
|         video_description = get_element_by_id("eow-description", video_webpage) |         video_description = get_element_by_id("eow-description", video_webpage) | ||||||
|         if video_description: |         if video_description: | ||||||
| @@ -1261,8 +1267,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             'title': video_title, |             'title': video_title, | ||||||
|             'thumbnail': video_thumbnail, |             'thumbnail': video_thumbnail, | ||||||
|             'description': video_description, |             'description': video_description, | ||||||
|             'tags' : video_tags, |  | ||||||
|             'categories': video_categories, |             'categories': video_categories, | ||||||
|  |             'tags': video_tags, | ||||||
|             'subtitles': video_subtitles, |             'subtitles': video_subtitles, | ||||||
|             'automatic_captions': automatic_captions, |             'automatic_captions': automatic_captions, | ||||||
|             'duration': video_duration, |             'duration': video_duration, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․