[youtube] improve Youtube Music Auto-generated description parsing (closes #20742)
This commit is contained in:
parent
5caabd3c70
commit
822b9d9cb0
@ -1088,7 +1088,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
# artist and track fields should return non-null, per issue #20599
|
# Youtube Music Auto-generated description
|
||||||
'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
|
'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'MgNrAu2pzNs',
|
'id': 'MgNrAu2pzNs',
|
||||||
@ -1109,11 +1109,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
# Youtube Music Auto-generated description
|
||||||
# Retrieve 'artist' field from 'Artist:' in video description
|
# Retrieve 'artist' field from 'Artist:' in video description
|
||||||
# when it is present on youtube music video
|
# when it is present on youtube music video
|
||||||
# Some videos have release_date and no release_year -
|
|
||||||
# (release_year should be extracted from release_date)
|
|
||||||
# https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932
|
|
||||||
'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
|
'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'k0jLE7tTwjY',
|
'id': 'k0jLE7tTwjY',
|
||||||
@ -1134,6 +1132,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
# Youtube Music Auto-generated description
|
||||||
# handle multiple artists on youtube music video
|
# handle multiple artists on youtube music video
|
||||||
'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
|
'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
@ -1155,6 +1154,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
# Youtube Music Auto-generated description
|
||||||
# handle youtube music video with release_year and no release_date
|
# handle youtube music video with release_year and no release_date
|
||||||
'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
|
'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
@ -2161,36 +2161,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||||||
|
|
||||||
track = extract_meta('Song')
|
track = extract_meta('Song')
|
||||||
artist = extract_meta('Artist')
|
artist = extract_meta('Artist')
|
||||||
album = None
|
|
||||||
release_date = None
|
|
||||||
release_year = None
|
|
||||||
|
|
||||||
description_info = video_description.split('\n\n')
|
# Youtube Music Auto-generated description
|
||||||
# If the description of the video has the youtube music auto-generated format, extract additional info
|
album = release_date = release_year = None
|
||||||
if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.':
|
if video_description:
|
||||||
track_artist = description_info[1].split(' · ')
|
mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
|
||||||
if len(track_artist) >= 2:
|
if mobj:
|
||||||
if track is None:
|
if not track:
|
||||||
track = track_artist[0]
|
track = mobj.group('track').strip()
|
||||||
if artist is None:
|
if not artist:
|
||||||
artist = re.search(r'Artist: ([^\n]+)', description_info[-2])
|
artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
|
||||||
if artist:
|
# Strip the captured album text itself, not the literal group name, so it is whitespace-trimmed like track/artist.
album = mobj.group('album').strip()
|
||||||
artist = artist.group(1)
|
release_year = mobj.group('release_year')
|
||||||
if artist is None:
|
release_date = mobj.group('release_date')
|
||||||
artist = track_artist[1]
|
if release_date:
|
||||||
# handle multiple artists
|
release_date = release_date.replace('-', '')
|
||||||
if len(track_artist) > 2:
|
if not release_year:
|
||||||
for i in range(2, len(track_artist)):
|
release_year = int(release_date[:4])
|
||||||
artist += ', %s' % track_artist[i]
|
if release_year:
|
||||||
release_year = re.search(r'℗ ([0-9]+)', video_description)
|
release_year = int(release_year)
|
||||||
if release_year:
|
|
||||||
release_year = int_or_none(release_year.group(1))
|
|
||||||
album = description_info[2]
|
|
||||||
if description_info[4].startswith('Released on: '):
|
|
||||||
release_date = description_info[4].split(': ')[1].replace('-', '')
|
|
||||||
# extract release_year from release_date if necessary
|
|
||||||
if release_year is None:
|
|
||||||
release_year = int_or_none(release_date[0:4])
|
|
||||||
|
|
||||||
m_episode = re.search(
|
m_episode = re.search(
|
||||||
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
||||||
|
Loading…
Reference in New Issue
Block a user