# spektors_thesaurus_scraper/scrape.py

#!/usr/bin/env python3

import os
from os.path import exists, splitext
from shutil import copyfileobj

import requests


# Load the songs page in a fresh session; the 'svSession' cookie that is read
# back from the session afterwards is passed along with the API queries below.
session = requests.Session()
response = session.get('https://www.spektorsthesaurus.com/songs')
# Stop on an HTTP error here instead of failing later with an unexplained
# KeyError on the cookie lookup.
response.raise_for_status()
session_id = session.cookies.get_dict()['svSession']

# Query the 'tblSongs' collection: empty filter, sorted by 'song_name'
# ascending, a single page of at most 999 items.
response = session.post(
    'https://www.spektorsthesaurus.com/_api/cloud-data/v1/wix-data/collections/query',
    headers={
        'Accept': 'application/json',
    },
    cookies={
        'svSession': session_id,
    },
    json={
        'collectionName': 'tblSongs',
        'dataQuery': {
            'filter': {
                '$and': [],
            },
            'sort': [
                {
                    'fieldName': 'song_name',
                    'order': 'ASC',
                },
            ],
            'paging': {
                'offset': 0,
                'limit': 999,
            },
            'fields': [],
        },
        'options': {},
        'includeReferencedItems': [],
        'segment': 'LIVE',
        'appId': 'e3c84d19-bfb6-4299-824a-3236a027d528',
    },
)
response.raise_for_status()

# For every song in the tblSongs result, look up its live performances and
# download each performance's bootleg into the current working directory.
# The iterable below is evaluated once, so rebinding `response` inside the
# loop does not disturb the iteration over songs.
for song in response.json()['items']:
    print(song['song_name'])

    song_id = song['song_id']

    # live performances: rows of 'tblSongLivePerfs' whose songId matches and
    # whose songInfo is not 'Aborted', sorted by eventDate ascending.
    response = session.post(
        'https://www.spektorsthesaurus.com/_api/cloud-data/v1/wix-data/collections/query',
        headers={
            'Accept': 'application/json',
        },
        cookies={
            'svSession': session_id,
        },
        json={
            'collectionName': 'tblSongLivePerfs',
            'dataQuery': {
                'filter': {
                    '$and': [
                        {
                            'songId': {
                                '$eq': song_id,
                            },
                        },
                        {
                            'songInfo': {
                                '$ne': 'Aborted',
                            },
                        },
                    ],
                },
                'sort': [
                    {
                        'fieldName': 'eventDate',
                        'order': 'ASC',
                    },
                ],
                'paging': {
                    'offset': 0,
                    'limit': 999,
                },
                'fields': [],
            },
            'options': {},
            'includeReferencedItems': [],
            'segment': 'LIVE',
            'appId': 'e3c84d19-bfb6-4299-824a-3236a027d528',
        },
    )
    response.raise_for_status()

    for performance in response.json()['items']:
        # Performances without a 'bootleg' entry have nothing to download.
        if 'bootleg' not in performance:
            continue

        # Keep the extension of the bootleg URL; '/' is replaced because it
        # cannot appear inside a file name.
        _, extension = splitext(performance['bootleg'])
        filename = f"{song['song_name']} - {performance['eventName']}{extension}".replace('/', '|')

        if exists(filename):
            print(filename, 'exists')
            continue

        print('downloading', filename)

        # Download into a temporary sibling and rename only once the copy has
        # finished, so an interrupted run never leaves a truncated file that
        # the exists() check above would skip on the next run.
        partial = filename + '.part'
        try:
            with requests.get(performance['bootleg'], stream=True) as stream:
                # Do not save an HTTP error body as if it were the recording.
                stream.raise_for_status()
                # The raw stream bypasses requests' transparent decoding; ask
                # urllib3 to undo any gzip/deflate Content-Encoding.
                stream.raw.decode_content = True
                with open(partial, 'wb') as file:
                    copyfileobj(stream.raw, file)
            os.replace(partial, filename)
        finally:
            # After a successful rename the partial file is gone and this is
            # a no-op; after a failure it removes the incomplete download.
            if exists(partial):
                os.remove(partial)