feat: support scraping date

jinhojang6 2024-05-31 23:44:34 +09:00
parent e5eed06234
commit 65cca55a86
2 changed files with 97 additions and 42 deletions

events.json (12 changes)

@@ -1,16 +1,16 @@
 [
   {
     "title": "Logos Assembly: Belgrade",
-    "date": "Jun 3, 6:30 PM GMT+2",
     "location": "Beograd",
-    "href": "https://lu.mahttps://lu.ma/LA2",
-    "thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/e9/6d3a2f12-1c8c-42e9-8196-c1d240948030"
+    "href": "https://lu.ma/LA2",
+    "thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/e9/6d3a2f12-1c8c-42e9-8196-c1d240948030",
+    "date": "Jun 3\n4:30 PM"
   },
   {
     "title": "Logos Assembly: Brno",
-    "date": "Jun 13, 6:00 PM GMT+2",
     "location": "Studentský klub U Kachničky",
-    "href": "https://lu.mahttps://lu.ma/la3",
-    "thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/16/6718e8d5-8ce9-4247-8dbb-4f6e8d60392d"
+    "href": "https://lu.ma/la3",
+    "thumbnail": "https://images.lumacdn.com/cdn-cgi/image/format=auto,fit=cover,dpr=2,quality=75,width=180,height=180/event-covers/16/6718e8d5-8ce9-4247-8dbb-4f6e8d60392d",
+    "date": "Jun 13\n4:00 PM"
   }
 ]

scrape.js (127 changes)

@@ -1,61 +1,116 @@
 const fetch = require('node-fetch')
 const fs = require('fs')
+const puppeteer = require('puppeteer')
 const { JSDOM } = require('jsdom')

 // URL to scrape
 const url = 'https://lu.ma/logosevents'
+const urlCompact = 'https://lu.ma/logosevents?compact=true'

 // Function to fetch and parse HTML
 async function scrapeData() {
   try {
-    const response = await fetch(url)
-    const html = await response.text()
-
-    const dom = new JSDOM(html)
-    const document = dom.window.document
-
-    const events = []
-
-    // Select elements with the .event-link class
-    const eventLinks = document.querySelectorAll('a.event-link.content-link')
-
-    eventLinks.forEach(eventLink => {
-      const title = eventLink.getAttribute('aria-label')
-      const href = eventLink.href
-      const eventContent = eventLink.nextElementSibling
-      const location = eventContent
-        .querySelector('.attribute:nth-of-type(2) > .text-ellipses')
-        ?.textContent.trim()
-      const thumbnail = eventContent.querySelector('img')?.src
-      const date = eventLink
-        .querySelector('.jsx-2921306942 > .date')
-        ?.textContent.trim()
-
-      // Push the extracted data to the events array
-      events.push({
-        title,
-        date: date,
-        location,
-        href: `https://lu.ma${href}`,
-        thumbnail,
-      })
-    })
+    // Launch Puppeteer
+    const browser = await puppeteer.launch()
+    const page = await browser.newPage()
+    await page.goto(url, { waitUntil: 'networkidle2' })
+
+    // Wait for the required elements to load
+    await page.waitForSelector('a.event-link.content-link')
+
+    // Scrape the data
+    const events = await page.evaluate(() => {
+      const eventLinks = document.querySelectorAll('a.event-link.content-link')
+      const events = []
+
+      eventLinks.forEach(eventLink => {
+        const title = eventLink.getAttribute('aria-label')
+        const href = eventLink.href
+        const eventContent = eventLink.nextElementSibling
+        const location = eventContent
+          .querySelector('.attribute:nth-of-type(2) > .text-ellipses')
+          ?.textContent.trim()
+        const thumbnail = eventContent.querySelector('img')?.src
+        const date = eventContent
+          .querySelector('.event-time .text-warning')
+          ?.textContent.trim()
+
+        events.push({
+          title,
+          date,
+          location,
+          href: `https://lu.ma${href}`,
+          thumbnail,
+        })
+      })
+
+      return events
+    })

     // Write data to a JSON file
     fs.writeFileSync('events.json', JSON.stringify(events, null, 2))
     console.log('Data scraped and saved to events.json')
+
+    // Close Puppeteer
+    await browser.close()
   } catch (error) {
     console.error('Error scraping data:', error)
   }
 }

-// Execute the function
-scrapeData()
+async function scrapeEventDate() {
+  try {
+    const response = await fetch(urlCompact)
+    const html = await response.text()
+
+    const dom = new JSDOM(html)
+    const document = dom.window.document
+
+    const events = []
+    console.log('html', html)
+
+    const eventData = document.querySelectorAll('.section')
+
+    eventData?.forEach(event => {
+      const date = event
+        .querySelector('.date-inner > .date')
+        ?.textContent.trim()
+      const time = event
+        .querySelector('.time.text-tertiary-alpha')
+        ?.textContent.trim()
+      console.log('time', time)
+      const href = event.querySelector('a')?.href
+
+      if (date && href) {
+        events.push({
+          date: `${date}\n${time}`,
+          href: `https://lu.ma${href}`,
+        })
+      }
+
+      // Update events.json with the date
+      const eventsData = JSON.parse(fs.readFileSync('events.json'))
+      const result = eventsData.map(eventData => {
+        const eventDate = events.find(event => event.href === eventData.href)
+
+        return {
+          ...eventData,
+          date: eventDate?.date,
+        }
+      })
+
+      fs.writeFileSync('events.json', JSON.stringify(result, null, 2))
+    })
+  } catch (error) {
+    console.error('Error scraping data:', error)
+  }
+}
+
+async function main() {
+  await scrapeData()
+  await scrapeEventDate()
+}
+
+main()
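
In the committed scrapeEventDate, the events.json read/merge/write runs inside the eventData?.forEach(...) callback, so the file is re-read and re-written once per .section element. Below is a minimal sketch of the same href-keyed merge done once after the loop; the helper name mergeDates is hypothetical, and the fallback to the existing date is an assumption not present in the committed code.

const fs = require('fs')

// Hypothetical helper: apply the scraped { date, href } pairs to events.json
// in a single read/write, after the forEach loop has finished collecting them.
function mergeDates(scrapedDates) {
  const eventsData = JSON.parse(fs.readFileSync('events.json', 'utf8'))

  const result = eventsData.map(event => {
    // Join on href, as the committed code does
    const match = scrapedDates.find(d => d.href === event.href)

    // Assumption: keep the existing date when the compact page has no matching
    // section, instead of leaving the field undefined
    return { ...event, date: match?.date ?? event.date }
  })

  fs.writeFileSync('events.json', JSON.stringify(result, null, 2))
}

Calling mergeDates(events) once after the forEach keeps the output format identical while avoiding the repeated file I/O inside the loop.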