spektors_thesaurus_scraper/scrape.py
2023-04-21 19:21:00 +02:00

162 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
import os
from os.path import exists, splitext
from shutil import copyfileobj
from urllib.parse import urlsplit

import requests
def download(url, filename):
    """Download *url* to *filename* unless that file already exists.

    The body is streamed into ``filename + '.part'`` and only renamed to
    *filename* once the transfer has finished, so an interrupted run never
    leaves a truncated file behind that a later run would report as
    ``exists`` and skip.

    Args:
        url: Address of the file to fetch.
        filename: Destination path on disk.

    Returns:
        None. Progress is reported on stdout; a non-success HTTP status is
        reported as ``failed:`` and nothing is written for it.
    """
    if exists(filename):
        print('exists:', filename)
        return

    print('downloading:', filename)
    partial = filename + '.part'
    try:
        with requests.get(url, stream=True) as stream:
            if not stream.ok:
                # Do not store an HTTP error page under the media filename.
                print('failed:', filename, stream.status_code)
                return
            # The raw urllib3 stream is not transfer-decoded by default; ask
            # it to undo gzip/deflate so the saved bytes are the real payload.
            stream.raw.decode_content = True
            with open(partial, 'wb') as file:
                copyfileobj(stream.raw, file)
        os.replace(partial, filename)
    except BaseException:
        # Also covers Ctrl-C in the middle of a transfer: drop the incomplete
        # file, then let the original exception propagate.
        if exists(partial):
            os.remove(partial)
        raise
SONGS_URL = 'https://www.spektorsthesaurus.com/songs'
QUERY_URL = 'https://www.spektorsthesaurus.com/_api/cloud-data/v1/wix-data/collections/query'
APP_ID = 'e3c84d19-bfb6-4299-824a-3236a027d528'
# Single page size sent with every query; results beyond it are not fetched.
QUERY_LIMIT = 999


def build_query(collection, conditions, sort_field):
    """Return the JSON payload for one collection query.

    Args:
        collection: Value sent as ``collectionName``.
        conditions: Filter clauses placed under ``$and`` (may be empty).
        sort_field: Field name the results are sorted by, ascending.

    Returns:
        A dict ready to be sent as the JSON body of the query request.
    """
    return {
        'collectionName': collection,
        'dataQuery': {
            'filter': {
                '$and': list(conditions),
            },
            'sort': [
                {
                    'fieldName': sort_field,
                    'order': 'ASC',
                },
            ],
            'paging': {
                'offset': 0,
                'limit': QUERY_LIMIT,
            },
            'fields': [],
        },
        'options': {},
        'includeReferencedItems': [],
        'segment': 'LIVE',
        'appId': APP_ID,
    }


def query_collection(session, session_id, collection, conditions, sort_field):
    """POST one collection query and return the ``items`` of the reply.

    Args:
        session: ``requests.Session`` used for the request.
        session_id: Value sent as the ``svSession`` cookie.
        collection: Name of the collection to query.
        conditions: Filter clauses combined under ``$and``.
        sort_field: Field name the results are sorted by, ascending.

    Returns:
        The ``items`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = session.post(
        QUERY_URL,
        headers={
            'Accept': 'application/json',
        },
        cookies={
            'svSession': session_id,
        },
        json=build_query(collection, conditions, sort_field),
    )
    response.raise_for_status()
    return response.json()['items']


def media_filename(song_name, label, url):
    """Build the local filename ``"<song_name> - <label><extension>"``.

    The extension is taken from the path component of *url* only, so a query
    string or fragment on the link never ends up in the filename. Every
    ``/`` in the result is replaced with ``|`` so the name cannot be read as
    a path into another directory.
    """
    _, extension = splitext(urlsplit(url).path)
    return f"{song_name} - {label}{extension}".replace('/', '|')


def main():
    """Download every linked live recording and demo of every listed song.

    Raises:
        RuntimeError: If loading the songs page did not set the
            ``svSession`` cookie that the query endpoint is sent.
        requests.HTTPError: If a collection query fails.
    """
    session = requests.Session()
    # Loading the public page first is what provides the svSession cookie.
    landing = session.get(SONGS_URL)
    session_id = session.cookies.get_dict().get('svSession')
    if session_id is None:
        raise RuntimeError(
            f'no svSession cookie received from {SONGS_URL} '
            f'(HTTP {landing.status_code})'
        )

    for song in query_collection(session, session_id, 'tblSongs', [], 'song_name'):
        song_name = song['song_name']
        song_id = song['song_id']
        print(song_name)

        # live performances
        performances = query_collection(
            session,
            session_id,
            'tblSongLivePerfs',
            [
                {'songId': {'$eq': song_id}},
                {'songInfo': {'$ne': 'Aborted'}},
            ],
            'eventDate',
        )
        for performance in performances:
            # Only performances that carry a recording link are fetched.
            if 'bootleg' in performance:
                link = performance['bootleg']
                download(link, media_filename(song_name, performance['eventName'], link))

        # demos
        demos = query_collection(
            session,
            session_id,
            'tblSongDemo',
            [
                {'song_id': {'$eq': song_id}},
            ],
            'song_name',
        )
        for demo in demos:
            if 'link' in demo:
                link = demo['link']
                download(link, media_filename(song_name, demo['release_name'], link))


if __name__ == '__main__':
    main()