feat: support scraping date
This commit is contained in:
parent
e5eed06234
commit
65cca55a86
12
events.json
12
events.json
|
@ -1,16 +1,16 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"title": "Logos Assembly: Belgrade",
|
"title": "Logos Assembly: Belgrade",
|
||||||
"date": "Jun 3, 6:30 PM GMT+2",
|
|
||||||
"location": "Beograd",
|
"location": "Beograd",
|
||||||
"href": "https://lu.mahttps://lu.ma/LA2",
|
"href": "https://lu.ma/LA2",
|
||||||
"thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/e9/6d3a2f12-1c8c-42e9-8196-c1d240948030"
|
"thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/e9/6d3a2f12-1c8c-42e9-8196-c1d240948030",
|
||||||
|
"date": "Jun 3\n4:30 PM"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Logos Assembly: Brno",
|
"title": "Logos Assembly: Brno",
|
||||||
"date": "Jun 13, 6:00 PM GMT+2",
|
|
||||||
"location": "Studentský klub U Kachničky",
|
"location": "Studentský klub U Kachničky",
|
||||||
"href": "https://lu.mahttps://lu.ma/la3",
|
"href": "https://lu.ma/la3",
|
||||||
"thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/16/6718e8d5-8ce9-4247-8dbb-4f6e8d60392d"
|
"thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/16/6718e8d5-8ce9-4247-8dbb-4f6e8d60392d",
|
||||||
|
"date": "Jun 13\n4:00 PM"
|
||||||
}
|
}
|
||||||
]
|
]
|
127
scrape.js
127
scrape.js
|
@ -1,61 +1,116 @@
|
||||||
|
const fetch = require('node-fetch')
|
||||||
const fs = require('fs')
|
const fs = require('fs')
|
||||||
const puppeteer = require('puppeteer')
|
const { JSDOM } = require('jsdom')
|
||||||
|
|
||||||
// URL to scrape
|
// URL to scrape
|
||||||
const url = 'https://lu.ma/logosevents'
|
const url = 'https://lu.ma/logosevents'
|
||||||
|
|
||||||
|
const urlCompact = 'https://lu.ma/logosevents?compact=true'
|
||||||
|
|
||||||
// Function to fetch and parse HTML
|
// Function to fetch and parse HTML
|
||||||
async function scrapeData() {
|
async function scrapeData() {
|
||||||
try {
|
try {
|
||||||
// Launch Puppeteer
|
const response = await fetch(url)
|
||||||
const browser = await puppeteer.launch()
|
const html = await response.text()
|
||||||
const page = await browser.newPage()
|
|
||||||
await page.goto(url, { waitUntil: 'networkidle2' })
|
|
||||||
|
|
||||||
// Wait for the required elements to load
|
const dom = new JSDOM(html)
|
||||||
await page.waitForSelector('a.event-link.content-link')
|
const document = dom.window.document
|
||||||
|
|
||||||
// Scrape the data
|
const events = []
|
||||||
const events = await page.evaluate(() => {
|
|
||||||
const eventLinks = document.querySelectorAll('a.event-link.content-link')
|
|
||||||
const events = []
|
|
||||||
|
|
||||||
eventLinks.forEach(eventLink => {
|
// Select elements with the .event-link class
|
||||||
const title = eventLink.getAttribute('aria-label')
|
const eventLinks = document.querySelectorAll('a.event-link.content-link')
|
||||||
const href = eventLink.href
|
|
||||||
const eventContent = eventLink.nextElementSibling
|
|
||||||
const location = eventContent
|
|
||||||
.querySelector('.attribute:nth-of-type(2) > .text-ellipses')
|
|
||||||
?.textContent.trim()
|
|
||||||
const thumbnail = eventContent.querySelector('img')?.src
|
|
||||||
|
|
||||||
const date = eventContent
|
eventLinks.forEach(eventLink => {
|
||||||
.querySelector('.event-time .text-warning')
|
const title = eventLink.getAttribute('aria-label')
|
||||||
?.textContent.trim()
|
|
||||||
|
|
||||||
events.push({
|
const href = eventLink.href
|
||||||
title,
|
|
||||||
date,
|
const eventContent = eventLink.nextElementSibling
|
||||||
location,
|
|
||||||
href: `https://lu.ma${href}`,
|
const location = eventContent
|
||||||
thumbnail,
|
.querySelector('.attribute:nth-of-type(2) > .text-ellipses')
|
||||||
})
|
?.textContent.trim()
|
||||||
|
|
||||||
|
const thumbnail = eventContent.querySelector('img')?.src
|
||||||
|
|
||||||
|
const date = eventLink
|
||||||
|
.querySelector('.jsx-2921306942 > .date')
|
||||||
|
?.textContent.trim()
|
||||||
|
|
||||||
|
// Push the extracted data to the events array
|
||||||
|
events.push({
|
||||||
|
title,
|
||||||
|
date: date,
|
||||||
|
location,
|
||||||
|
href: `https://lu.ma${href}`,
|
||||||
|
thumbnail,
|
||||||
})
|
})
|
||||||
|
|
||||||
return events
|
|
||||||
})
|
})
|
||||||
|
|
||||||
// Write data to a JSON file
|
// Write data to a JSON file
|
||||||
fs.writeFileSync('events.json', JSON.stringify(events, null, 2))
|
fs.writeFileSync('events.json', JSON.stringify(events, null, 2))
|
||||||
|
|
||||||
console.log('Data scraped and saved to events.json')
|
console.log('Data scraped and saved to events.json')
|
||||||
|
|
||||||
// Close Puppeteer
|
|
||||||
await browser.close()
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error scraping data:', error)
|
console.error('Error scraping data:', error)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute the function
|
async function scrapeEventDate() {
|
||||||
scrapeData()
|
try {
|
||||||
|
const response = await fetch(urlCompact)
|
||||||
|
const html = await response.text()
|
||||||
|
|
||||||
|
const dom = new JSDOM(html)
|
||||||
|
const document = dom.window.document
|
||||||
|
|
||||||
|
const events = []
|
||||||
|
|
||||||
|
console.log('html', html)
|
||||||
|
const eventData = document.querySelectorAll('.section')
|
||||||
|
|
||||||
|
eventData?.forEach(event => {
|
||||||
|
const date = event
|
||||||
|
.querySelector('.date-inner > .date')
|
||||||
|
?.textContent.trim()
|
||||||
|
|
||||||
|
const time = event
|
||||||
|
.querySelector('.time.text-tertiary-alpha')
|
||||||
|
?.textContent.trim()
|
||||||
|
|
||||||
|
console.log('time', time)
|
||||||
|
|
||||||
|
const href = event.querySelector('a')?.href
|
||||||
|
|
||||||
|
if (date && href) {
|
||||||
|
events.push({
|
||||||
|
date: `${date}\n${time}`,
|
||||||
|
href: `https://lu.ma${href}`,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update events.json with the date
|
||||||
|
const eventsData = JSON.parse(fs.readFileSync('events.json'))
|
||||||
|
|
||||||
|
const result = eventsData.map(eventData => {
|
||||||
|
const eventDate = events.find(event => event.href === eventData.href)
|
||||||
|
return {
|
||||||
|
...eventData,
|
||||||
|
date: eventDate?.date,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
fs.writeFileSync('events.json', JSON.stringify(result, null, 2))
|
||||||
|
})
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error scraping data:', error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
await scrapeData()
|
||||||
|
await scrapeEventDate()
|
||||||
|
}
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
Loading…
Reference in New Issue