4 changes: 0 additions & 4 deletions main.go
@@ -36,8 +36,6 @@ func main() {

// Flag for profile scraping
scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")
// Flag for soc scraping
scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.")
// Flag for calendar scraping and parsing
cometCalendar := flag.Bool("cometCalendar", false, "Alongside -scrape or -parse, signifies that the Comet Calendar should be scraped/parsed.")
// Flag for astra scraping and parsing
@@ -106,8 +104,6 @@ func main() {
log.Panic("No term specified for coursebook scraping! Use -term to specify.")
}
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
case *scrapeOrganizations:
scrapers.ScrapeOrganizations(*outDir)
case *cometCalendar:
scrapers.ScrapeCometCalendar(*outDir)
case *astra:
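For context, here is a minimal, runnable sketch of the boolean-flag dispatch pattern main.go uses; this is a simplification, not the file's actual structure — flag names are trimmed down, the real file defines many more, and outDir handling is omitted:

package main

import (
	"flag"
	"log"
)

func main() {
	// Each action gets its own boolean flag, as in the real main.go.
	scrape := flag.Bool("scrape", false, "Run a scraper.")
	cometCalendar := flag.Bool("cometCalendar", false, "Alongside -scrape, scrape the Comet Calendar.")
	flag.Parse()

	if *scrape {
		// A switch over the dereferenced flag pointers selects the scraper,
		// mirroring the case *cometCalendar: branch shown in the diff.
		switch {
		case *cometCalendar:
			log.Println("would call scrapers.ScrapeCometCalendar(outDir)")
		default:
			log.Panic("No scraper specified!")
		}
	}
}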
14 changes: 5 additions & 9 deletions parser/cometCalendarParser.go
@@ -12,7 +12,6 @@ import (
"regexp"
"slices"
"strings"
"time"

"github.com/UTDNebula/api-tools/scrapers"
"github.com/UTDNebula/api-tools/utils"
@@ -137,6 +136,7 @@ var DefaultValid []string = []string{
"RCW",
}

// ParseCometCalendar reformats the comet calendar data into JSON that can be uploaded to Mongo
func ParseCometCalendar(inDir string, outDir string) {

calendarFile, err := os.ReadFile(inDir + "/cometCalendarScraped.json")
@@ -251,21 +251,18 @@ func ParseCometCalendar(inDir string, outDir string) {
utils.WriteJSON(fmt.Sprintf("%s/cometCalendar.json", outDir), result)
}

// getAbbreviations dynamically retrieves the all of the locations abbreviations
// getLocationAbbreviations dynamically retrieves all of the location abbreviations
func getLocationAbbreviations(inDir string) (map[string]string, []string, error) {
// Get the locations from the map scraper
var mapFile []byte

mapFile, err := os.ReadFile(inDir + "/mapLocations.json")
if err != nil {
if os.IsNotExist(err) {
// Scrape the data if the it doesn't exist yet and then get the map file
// Force a scrape of the locations if the file doesn't exist yet, then read the map file again
scrapers.ScrapeMapLocations(inDir)
time.Sleep(2 * time.Second)
ParseMapLocations(inDir, inDir)
time.Sleep(2 * time.Second)

// If fail to get the locations again, it's not because location is unscraped
// If reading the locations fails again, it's not because they haven't been scraped
mapFile, err = os.ReadFile(inDir + "/mapLocations.json")
if err != nil {
return nil, nil, err
@@ -274,7 +271,6 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
return nil, nil, err
}
}

var locations []schema.MapBuilding
if err = json.Unmarshal(mapFile, &locations); err != nil {
return nil, nil, err
@@ -288,7 +284,7 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
// Trim the following acronym in the name
trimmedName := strings.Split(*location.Name, " (")[0]
// Fallback on the locations that have no acronyms
abbreviation := ""
var abbreviation string
if location.Acronym != nil {
abbreviation = *location.Acronym
}
2 changes: 1 addition & 1 deletion parser/gradeLoader.go
@@ -100,7 +100,7 @@ func csvToMap(csvFile *os.File, logFile *os.File) map[string][]int {
// optional columns
for _, name := range []string{"W", "P", "CR", "NC", "I"} {
if _, ok := indexMap[name]; !ok {
logFile.WriteString(fmt.Sprintf("could not find %s column\n", name))
fmt.Fprintf(logFile, "could not find %s column\n", name)
}
}

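The gradeLoader change swaps logFile.WriteString(fmt.Sprintf(...)) for fmt.Fprintf, which formats directly into any io.Writer instead of building an intermediate string. A small, self-contained sketch of the idiom (the log path here is hypothetical):

package main

import (
	"fmt"
	"os"
)

func main() {
	// Hypothetical log destination, for illustration only.
	logFile, err := os.Create("/tmp/grades.log")
	if err != nil {
		panic(err)
	}
	defer logFile.Close()

	name := "CR"
	// Before: logFile.WriteString(fmt.Sprintf("could not find %s column\n", name))
	// After: format straight into the io.Writer.
	fmt.Fprintf(logFile, "could not find %s column\n", name)
}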
59 changes: 47 additions & 12 deletions scrapers/adacemicCalendars.go
@@ -41,16 +41,34 @@ func ScrapeAcademicCalendars(outDir string) {
}

// Go to listings page
chromedp.RunResponse(chromedpCtx,
_, err = chromedp.RunResponse(chromedpCtx,
chromedp.Navigate(`https://www.utdallas.edu/academics/calendar/`),
)
if err != nil {
panic(err)
}

// Selectors for scraping the calendar nodes
currentSel := `a.wp-block-button__link`
futureSel := `//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`
pastSel := `//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`

// Extract data from links
// Current
academicCalendars := []AcademicCalendar{{"", "", "current"}}
chromedp.Run(chromedpCtx, chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery))
err = chromedp.Run(chromedpCtx,
chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery),
)
if err != nil {
panic(err)
}
var currentNode []*cdp.Node
chromedp.Run(chromedpCtx, chromedp.Nodes("a.wp-block-button__link", &currentNode, chromedp.ByQuery))
err = chromedp.Run(chromedpCtx,
chromedp.Nodes(currentSel, &currentNode, chromedp.ByQuery),
)
if err != nil {
panic(err)
}
for i := 0; i < len(currentNode[0].Attributes); i += 2 {
if currentNode[0].Attributes[i] == "href" {
academicCalendars[0].Href = currentNode[0].Attributes[i+1]
@@ -59,29 +77,42 @@

// Future list
var futureNodes []*cdp.Node
chromedp.Run(chromedpCtx,
chromedp.Nodes(`//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`, &futureNodes, chromedp.BySearch),
err = chromedp.Run(chromedpCtx,
chromedp.Nodes(futureSel, &futureNodes, chromedp.BySearch),
)
academicCalendars = append(academicCalendars, extractTextAndHref(futureNodes, "future", chromedpCtx)...)
if err != nil {
panic(err)
}
newCalendars := extractTextAndHref(futureNodes, "future", chromedpCtx)
academicCalendars = append(academicCalendars, newCalendars...)

// Past list
var pastNodes []*cdp.Node
chromedp.Run(chromedpCtx,
chromedp.Nodes(`//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`, &pastNodes, chromedp.BySearch),
err = chromedp.Run(chromedpCtx,
chromedp.Nodes(pastSel, &pastNodes, chromedp.BySearch),
)
academicCalendars = append(academicCalendars, extractTextAndHref(pastNodes, "past", chromedpCtx)...)
if err != nil {
panic(err)
}
newCalendars = extractTextAndHref(pastNodes, "past", chromedpCtx)
academicCalendars = append(academicCalendars, newCalendars...)

// Don't need ChromeDP anymore
cancel()

// Download all PDFs
for _, academicCalendar := range academicCalendars {
downloadPdfFromBox(academicCalendar.Href, academicCalendar.Time+"-"+academicCalendar.Title, outSubDir)
downloadPdfFromBox(
academicCalendar.Href,
academicCalendar.Time+"-"+academicCalendar.Title,
outSubDir,
)
}
}

func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Context) []AcademicCalendar {
output := []AcademicCalendar{}
var err error

// Extract href and text
for _, n := range nodes {
Expand All @@ -93,8 +124,12 @@ func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Cont
}
}
// Get inner text
chromedp.Run(chromedpCtx, chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery))

err = chromedp.Run(chromedpCtx,
chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery),
)
if err != nil {
panic(err)
}
output = append(output, AcademicCalendar{text, href, time})
}

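Every chromedp call in this file now repeats the err = chromedp.Run(...); if err != nil { panic(err) } sequence. A sketch of a helper that would keep the same panic-on-error policy with less repetition — mustRun is hypothetical and not part of this PR:

package scrapers

import (
	"context"

	"github.com/chromedp/chromedp"
)

// mustRun runs the given chromedp actions and panics on failure, collapsing
// the repeated error checks into one place.
func mustRun(ctx context.Context, actions ...chromedp.Action) {
	if err := chromedp.Run(ctx, actions...); err != nil {
		panic(err)
	}
}

Usage, mirroring the "Current" extraction above:

mustRun(chromedpCtx, chromedp.Nodes(currentSel, &currentNode, chromedp.ByQuery))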
61 changes: 28 additions & 33 deletions scrapers/cometCalendar.go
@@ -19,7 +19,7 @@ import (
"go.mongodb.org/mongo-driver/bson/primitive"
)

const CAL_URL string = "https://calendar.utdallas.edu/api/2/events"
const COMET_CALENDAR_URL string = "https://calendar.utdallas.edu/api/2/events"

// RawEvent mirrors the nested event payload returned by the calendar API.
type RawEvent struct {
@@ -66,16 +66,16 @@ func ScrapeCometCalendar(outDir string) {

calendarEvents = append(calendarEvents, schema.Event{
Id: primitive.NewObjectID(),
Summary: convert[string](rawEvent.Event["title"]),
Summary: to[string](rawEvent.Event["title"]),
Location: getEventLocation(rawEvent),
StartTime: startTime,
EndTime: endTime,
Description: convert[string](rawEvent.Event["description_text"]),
Description: to[string](rawEvent.Event["description_text"]),
EventType: eventTypes,
TargetAudience: targetAudiences,
Topic: eventTopics,
EventTags: tags,
EventWebsite: convert[string](rawEvent.Event["url"]),
EventWebsite: to[string](rawEvent.Event["url"]),
Department: departments,
ContactName: contactInfo[0],
ContactEmail: contactInfo[1],
@@ -94,10 +94,10 @@
log.Printf("Finished scraping %d events successfully!\n\n", len(calendarEvents))
}

// scrapeAndUnmarshal fetches a calendar page and decodes it into data.
// callAndUnmarshal fetches a calendar page and decodes it into data.
func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error {
// Call API to get the byte data
calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", CAL_URL, page)
calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", COMET_CALENDAR_URL, page)
request, err := http.NewRequest("GET", calendarUrl, nil)
if err != nil {
return err
@@ -130,35 +130,30 @@

// getTime parses the start and end time of the event
func getTime(event RawEvent) (time.Time, time.Time) {
instance := convert[map[string]any](
convert[map[string]any](
convert[[]any](event.Event["event_instances"])[0])["event_instance"])
instance := to[map[string]any](to[map[string]any](to[[]any](event.Event["event_instances"])[0])["event_instance"])

// Converts RFC3339 timestamp string to time.Time
startTime, err := time.Parse(time.RFC3339, convert[string](instance["start"]))
startTime, err := time.Parse(time.RFC3339, to[string](instance["start"]))
if err != nil {
panic(err)
}

var endTime time.Time
if convert[string](instance["end"]) != "" {
endTime, err = time.Parse(time.RFC3339, convert[string](instance["end"]))
endTime := startTime
if to[string](instance["end"]) != "" {
endTime, err = time.Parse(time.RFC3339, to[string](instance["end"]))
if err != nil {
panic(err)
}
} else {
endTime = startTime
}

return startTime, endTime
}

// getEventLocation parses the location of the event
func getEventLocation(event RawEvent) string {
building := convert[string](event.Event["location_name"])
room := convert[string](event.Event["room_number"])
building := to[string](event.Event["location_name"])
room := to[string](event.Event["room_number"])
location := strings.Trim(fmt.Sprintf("%s, %s", building, room), " ,")

return location
}

@@ -168,21 +163,21 @@ func getFilters(event RawEvent) ([]string, []string, []string) {
audiences := []string{}
topics := []string{}

filters := convert[map[string]any](event.Event["filters"])
filters := to[map[string]any](event.Event["filters"])

rawTypes := convert[[]any](filters["event_types"])
rawTypes := to[[]any](filters["event_types"])
for _, rawType := range rawTypes {
types = append(types, convert[string](convert[map[string]any](rawType)["name"]))
types = append(types, to[string](to[map[string]any](rawType)["name"]))
}

rawAudiences := convert[[]any](filters["event_target_audience"])
rawAudiences := to[[]any](filters["event_target_audience"])
for _, audience := range rawAudiences {
audiences = append(audiences, convert[string](convert[map[string]any](audience)["name"]))
audiences = append(audiences, to[string](to[map[string]any](audience)["name"]))
}

rawTopics := convert[[]any](filters["event_topic"])
rawTopics := to[[]any](filters["event_topic"])
for _, topic := range rawTopics {
topics = append(topics, convert[string](convert[map[string]any](topic)["name"]))
topics = append(topics, to[string](to[map[string]any](topic)["name"]))
}

return types, audiences, topics
@@ -193,14 +188,14 @@ func getDepartmentsAndTags(event RawEvent) ([]string, []string) {
departments := []string{}
tags := []string{}

rawTags := convert[[]any](event.Event["tags"])
rawTags := to[[]any](event.Event["tags"])
for _, tag := range rawTags {
tags = append(tags, convert[string](tag))
tags = append(tags, to[string](tag))
}

rawDeparments := convert[[]any](event.Event["departments"])
rawDeparments := to[[]any](event.Event["departments"])
for _, deparment := range rawDeparments {
departments = append(departments, convert[string](convert[map[string]any](deparment)["name"]))
departments = append(departments, to[string](to[map[string]any](deparment)["name"]))
}

return departments, tags
@@ -211,20 +206,20 @@ func getContactInfo(event RawEvent) [3]string {
// Note that some events won't have a contact phone number
contactInfo := [3]string{}

rawContactInfo := convert[map[string]any](event.Event["custom_fields"])
rawContactInfo := to[map[string]any](event.Event["custom_fields"])
for i, infoField := range []string{
"contact_information_name",
"contact_information_email",
"contact_information_phone",
} {
contactInfo[i] = convert[string](rawContactInfo[infoField])
contactInfo[i] = to[string](rawContactInfo[infoField])
}

return contactInfo
}

// convert() attempts to convert data into types for this scraper
func convert[T []any | map[string]any | string](data any) T {
// to attempts to convert data into one of this scraper's types, returning the zero value on failure
func to[T []any | map[string]any | string](data any) T {
if newTypedData, ok := data.(T); ok {
return newTypedData
}
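Since the body of to is truncated in the diff, here is a self-contained sketch of its presumed behavior; the zero-value fallback after a failed assertion is assumed from the doc comment:

package main

import "fmt"

// to mirrors the helper in scrapers/cometCalendar.go: a type assertion with a
// zero-value fallback (assumed), so missing or mistyped fields come back as
// "", nil slices, or nil maps instead of panicking.
func to[T []any | map[string]any | string](data any) T {
	if typed, ok := data.(T); ok {
		return typed
	}
	var zero T
	return zero
}

func main() {
	event := map[string]any{
		"title": "Career Fair",
		"tags":  []any{"career", "networking"},
	}

	fmt.Println(to[string](event["title"]))    // "Career Fair"
	fmt.Println(to[string](event["missing"]))  // "" — absent key yields the zero value
	fmt.Println(len(to[[]any](event["tags"]))) // 2
}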