diff --git a/main.go b/main.go
index 7d0af33..9ba21b6 100644
--- a/main.go
+++ b/main.go
@@ -36,8 +36,6 @@ func main() {
 
 	// Flag for profile scraping
 	scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")
-	// Flag for soc scraping
-	scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.")
 	// Flag for calendar scraping and parsing
 	cometCalendar := flag.Bool("cometCalendar", false, "Alongside -scrape or -parse, signifies that the Comet Calendar should be scraped/parsed.")
 	// Flag for astra scraping and parsing
@@ -106,8 +104,6 @@ func main() {
 			log.Panic("No term specified for coursebook scraping! Use -term to specify.")
 		}
 		scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
-	case *scrapeOrganizations:
-		scrapers.ScrapeOrganizations(*outDir)
 	case *cometCalendar:
 		scrapers.ScrapeCometCalendar(*outDir)
 	case *astra:
diff --git a/parser/cometCalendarParser.go b/parser/cometCalendarParser.go
index 4575b6c..286ea67 100644
--- a/parser/cometCalendarParser.go
+++ b/parser/cometCalendarParser.go
@@ -12,7 +12,6 @@ import (
 	"regexp"
 	"slices"
 	"strings"
-	"time"
 
 	"github.com/UTDNebula/api-tools/scrapers"
 	"github.com/UTDNebula/api-tools/utils"
@@ -137,6 +136,7 @@ var DefaultValid []string = []string{
 	"RCW",
 }
 
+// ParseCometCalendar reformats the scraped Comet Calendar data into JSON that can be uploaded to Mongo
 func ParseCometCalendar(inDir string, outDir string) {
 
 	calendarFile, err := os.ReadFile(inDir + "/cometCalendarScraped.json")
@@ -251,21 +251,18 @@ func ParseCometCalendar(inDir string, outDir string) {
 	utils.WriteJSON(fmt.Sprintf("%s/cometCalendar.json", outDir), result)
 }
 
-// getAbbreviations dynamically retrieves the all of the locations abbreviations
+// getLocationAbbreviations dynamically retrieves all of the location abbreviations
 func getLocationAbbreviations(inDir string) (map[string]string, []string, error) {
 	// Get the locations from the map scraper
 	var mapFile []byte
-
 	mapFile, err := os.ReadFile(inDir + "/mapLocations.json")
 	if err != nil {
 		if os.IsNotExist(err) {
-			// Scrape the data if the it doesn't exist yet and then get the map file
+			// Force-scrape the locations if they don't exist yet, then read the map file again
 			scrapers.ScrapeMapLocations(inDir)
-			time.Sleep(2 * time.Second)
 			ParseMapLocations(inDir, inDir)
-			time.Sleep(2 * time.Second)
-			// If fail to get the locations again, it's not because location is unscraped
+			// If reading the locations fails again, it's not because they haven't been scraped
 			mapFile, err = os.ReadFile(inDir + "/mapLocations.json")
 			if err != nil {
 				return nil, nil, err
 			}
@@ -274,7 +271,6 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
 			return nil, nil, err
 		}
 	}
-
 	var locations []schema.MapBuilding
 	if err = json.Unmarshal(mapFile, &locations); err != nil {
 		return nil, nil, err
@@ -288,7 +284,7 @@ func getLocationAbbreviations(inDir string) (map[string]string, []string, error)
 		// Trim the following acronym in the name
 		trimmedName := strings.Split(*location.Name, " (")[0]
 		// Fallback on the locations that have no acronyms
-		abbreviation := ""
+		var abbreviation string
 		if location.Acronym != nil {
 			abbreviation = *location.Acronym
 		}
diff --git a/parser/gradeLoader.go b/parser/gradeLoader.go
index 57c7d7a..cff86a3 100644
--- a/parser/gradeLoader.go
+++ b/parser/gradeLoader.go
@@ -100,7 +100,7 @@ func csvToMap(csvFile *os.File, logFile *os.File) map[string][]int {
 	// optional columns
 	for _, name := range []string{"W", "P", "CR", "NC", "I"} {
 		if _, ok := indexMap[name]; !ok {
-			logFile.WriteString(fmt.Sprintf("could not find %s column\n", name))
+			fmt.Fprintf(logFile, "could not find %s column\n", name)
 		}
 	}
 
diff --git a/scrapers/adacemicCalendars.go b/scrapers/adacemicCalendars.go
index b641619..e321a07 100644
--- a/scrapers/adacemicCalendars.go
+++ b/scrapers/adacemicCalendars.go
@@ -41,16 +41,34 @@ func ScrapeAcademicCalendars(outDir string) {
 	}
 
 	// Go to listings page
-	chromedp.RunResponse(chromedpCtx,
+	_, err = chromedp.RunResponse(chromedpCtx,
 		chromedp.Navigate(`https://www.utdallas.edu/academics/calendar/`),
 	)
+	if err != nil {
+		panic(err)
+	}
+
+	// Selectors for scraping the calendar nodes
+	currentSel := `a.wp-block-button__link`
+	futureSel := `//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`
+	pastSel := `//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`
 
 	// Extract data from links
 	// Current
 	academicCalendars := []AcademicCalendar{{"", "", "current"}}
-	chromedp.Run(chromedpCtx, chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery))
+	err = chromedp.Run(chromedpCtx,
+		chromedp.TextContent("h2.wp-block-heading", &academicCalendars[0].Title, chromedp.ByQuery),
+	)
+	if err != nil {
+		panic(err)
+	}
 	var currentNode []*cdp.Node
-	chromedp.Run(chromedpCtx, chromedp.Nodes("a.wp-block-button__link", &currentNode, chromedp.ByQuery))
+	err = chromedp.Run(chromedpCtx,
+		chromedp.Nodes(currentSel, &currentNode, chromedp.ByQuery),
+	)
+	if err != nil {
+		panic(err)
+	}
 	for i := 0; i < len(currentNode[0].Attributes); i += 2 {
 		if currentNode[0].Attributes[i] == "href" {
 			academicCalendars[0].Href = currentNode[0].Attributes[i+1]
@@ -59,29 +77,42 @@ func ScrapeAcademicCalendars(outDir string) {
 
 	// Future list
 	var futureNodes []*cdp.Node
-	chromedp.Run(chromedpCtx,
-		chromedp.Nodes(`//h2[normalize-space(text())="Future Terms"]/following-sibling::ul[1]//a`, &futureNodes, chromedp.BySearch),
+	err = chromedp.Run(chromedpCtx,
+		chromedp.Nodes(futureSel, &futureNodes, chromedp.BySearch),
 	)
-	academicCalendars = append(academicCalendars, extractTextAndHref(futureNodes, "future", chromedpCtx)...)
+ if err != nil { + panic(err) + } + newCalendars := extractTextAndHref(futureNodes, "future", chromedpCtx) + academicCalendars = append(academicCalendars, newCalendars...) // Past list var pastNodes []*cdp.Node - chromedp.Run(chromedpCtx, - chromedp.Nodes(`//h2[normalize-space(text())="Past Terms"]/following-sibling::div[1]//a`, &pastNodes, chromedp.BySearch), + err = chromedp.Run(chromedpCtx, + chromedp.Nodes(pastSel, &pastNodes, chromedp.BySearch), ) - academicCalendars = append(academicCalendars, extractTextAndHref(pastNodes, "past", chromedpCtx)...) + if err != nil { + panic(err) + } + newCalendars = extractTextAndHref(pastNodes, "past", chromedpCtx) + academicCalendars = append(academicCalendars, newCalendars...) // Don't need ChromeDP anymore cancel() // Download all PDFs for _, academicCalendar := range academicCalendars { - downloadPdfFromBox(academicCalendar.Href, academicCalendar.Time+"-"+academicCalendar.Title, outSubDir) + downloadPdfFromBox( + academicCalendar.Href, + academicCalendar.Time+"-"+academicCalendar.Title, + outSubDir, + ) } } func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Context) []AcademicCalendar { output := []AcademicCalendar{} + var err error // Extract href and text for _, n := range nodes { @@ -93,8 +124,12 @@ func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Cont } } // Get inner text - chromedp.Run(chromedpCtx, chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery)) - + err = chromedp.Run(chromedpCtx, + chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery), + ) + if err != nil { + panic(err) + } output = append(output, AcademicCalendar{text, href, time}) } diff --git a/scrapers/cometCalendar.go b/scrapers/cometCalendar.go index 3a4f613..e2ac39d 100644 --- a/scrapers/cometCalendar.go +++ b/scrapers/cometCalendar.go @@ -19,7 +19,7 @@ import ( "go.mongodb.org/mongo-driver/bson/primitive" ) -const CAL_URL string = "https://calendar.utdallas.edu/api/2/events" +const COMET_CALENDAR_URL string = "https://calendar.utdallas.edu/api/2/events" // RawEvent mirrors the nested event payload returned by the calendar API. type RawEvent struct { @@ -66,16 +66,16 @@ func ScrapeCometCalendar(outDir string) { calendarEvents = append(calendarEvents, schema.Event{ Id: primitive.NewObjectID(), - Summary: convert[string](rawEvent.Event["title"]), + Summary: to[string](rawEvent.Event["title"]), Location: getEventLocation(rawEvent), StartTime: startTime, EndTime: endTime, - Description: convert[string](rawEvent.Event["description_text"]), + Description: to[string](rawEvent.Event["description_text"]), EventType: eventTypes, TargetAudience: targetAudiences, Topic: eventTopics, EventTags: tags, - EventWebsite: convert[string](rawEvent.Event["url"]), + EventWebsite: to[string](rawEvent.Event["url"]), Department: departments, ContactName: contactInfo[0], ContactEmail: contactInfo[1], @@ -94,10 +94,10 @@ func ScrapeCometCalendar(outDir string) { log.Printf("Finished scraping %d events successfully!\n\n", len(calendarEvents)) } -// scrapeAndUnmarshal fetches a calendar page and decodes it into data. +// callAndUnmarshal fetches a calendar page and decodes it into data. 
func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error { // Call API to get the byte data - calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", CAL_URL, page) + calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", COMET_CALENDAR_URL, page) request, err := http.NewRequest("GET", calendarUrl, nil) if err != nil { return err @@ -130,24 +130,20 @@ func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) // getTime parses the start and end time of the event func getTime(event RawEvent) (time.Time, time.Time) { - instance := convert[map[string]any]( - convert[map[string]any]( - convert[[]any](event.Event["event_instances"])[0])["event_instance"]) + instance := to[map[string]any](to[map[string]any](to[[]any](event.Event["event_instances"])[0])["event_instance"]) // Converts RFC3339 timestamp string to time.Time - startTime, err := time.Parse(time.RFC3339, convert[string](instance["start"])) + startTime, err := time.Parse(time.RFC3339, to[string](instance["start"])) if err != nil { panic(err) } - var endTime time.Time - if convert[string](instance["end"]) != "" { - endTime, err = time.Parse(time.RFC3339, convert[string](instance["end"])) + endTime := startTime + if to[string](instance["end"]) != "" { + endTime, err = time.Parse(time.RFC3339, to[string](instance["end"])) if err != nil { panic(err) } - } else { - endTime = startTime } return startTime, endTime @@ -155,10 +151,9 @@ func getTime(event RawEvent) (time.Time, time.Time) { // getEventLocation parses the location of the event func getEventLocation(event RawEvent) string { - building := convert[string](event.Event["location_name"]) - room := convert[string](event.Event["room_number"]) + building := to[string](event.Event["location_name"]) + room := to[string](event.Event["room_number"]) location := strings.Trim(fmt.Sprintf("%s, %s", building, room), " ,") - return location } @@ -168,21 +163,21 @@ func getFilters(event RawEvent) ([]string, []string, []string) { audiences := []string{} topics := []string{} - filters := convert[map[string]any](event.Event["filters"]) + filters := to[map[string]any](event.Event["filters"]) - rawTypes := convert[[]any](filters["event_types"]) + rawTypes := to[[]any](filters["event_types"]) for _, rawType := range rawTypes { - types = append(types, convert[string](convert[map[string]any](rawType)["name"])) + types = append(types, to[string](to[map[string]any](rawType)["name"])) } - rawAudiences := convert[[]any](filters["event_target_audience"]) + rawAudiences := to[[]any](filters["event_target_audience"]) for _, audience := range rawAudiences { - audiences = append(audiences, convert[string](convert[map[string]any](audience)["name"])) + audiences = append(audiences, to[string](to[map[string]any](audience)["name"])) } - rawTopics := convert[[]any](filters["event_topic"]) + rawTopics := to[[]any](filters["event_topic"]) for _, topic := range rawTopics { - topics = append(topics, convert[string](convert[map[string]any](topic)["name"])) + topics = append(topics, to[string](to[map[string]any](topic)["name"])) } return types, audiences, topics @@ -193,14 +188,14 @@ func getDepartmentsAndTags(event RawEvent) ([]string, []string) { departments := []string{} tags := []string{} - rawTags := convert[[]any](event.Event["tags"]) + rawTags := to[[]any](event.Event["tags"]) for _, tag := range rawTags { - tags = append(tags, convert[string](tag)) + tags = append(tags, to[string](tag)) } - rawDeparments := convert[[]any](event.Event["departments"]) + rawDeparments 
:= to[[]any](event.Event["departments"]) for _, deparment := range rawDeparments { - departments = append(departments, convert[string](convert[map[string]any](deparment)["name"])) + departments = append(departments, to[string](to[map[string]any](deparment)["name"])) } return departments, tags @@ -211,20 +206,20 @@ func getContactInfo(event RawEvent) [3]string { // Note that some events won't have contact phone number contactInfo := [3]string{} - rawContactInfo := convert[map[string]any](event.Event["custom_fields"]) + rawContactInfo := to[map[string]any](event.Event["custom_fields"]) for i, infoField := range []string{ "contact_information_name", "contact_information_email", "contact_information_phone", } { - contactInfo[i] = convert[string](rawContactInfo[infoField]) + contactInfo[i] = to[string](rawContactInfo[infoField]) } return contactInfo } -// convert() attempts to convert data into types for this scraper -func convert[T []any | map[string]any | string](data any) T { +// to attempts to convert data into types for this scraper, or return nil value +func to[T []any | map[string]any | string](data any) T { if newTypedData, ok := data.(T); ok { return newTypedData } diff --git a/scrapers/organizations.go b/scrapers/organizations.go deleted file mode 100644 index 46aa833..0000000 --- a/scrapers/organizations.go +++ /dev/null @@ -1,302 +0,0 @@ -/* - This file contains the code for the student organization scraper. -*/ - -package scrapers - -import ( - "bufio" - "context" - "encoding/base64" - "encoding/csv" - "encoding/json" - "fmt" - "io" - "log" - "net/url" - "os" - "path/filepath" - "regexp" - "strings" - "time" - - "github.com/UTDNebula/api-tools/utils" - "github.com/UTDNebula/nebula-api/api/schema" - "github.com/chromedp/cdproto/browser" - "github.com/chromedp/cdproto/network" - "github.com/chromedp/chromedp" - "go.mongodb.org/mongo-driver/bson/primitive" -) - -const ( - socBaseUrl = `https://cometmail.sharepoint.com` - socLoginUrl = socBaseUrl + `/sites/StudentOrganizationCenterSP/Lists/Student%20Organization%20Directory/All%20Items%20gallery.aspx` - localPartCharClass = `[:alnum:]!#$%&'*+/=?^_` + "`" + `{|}~-` - subdomainPattern = `([[:alnum:]]([[:alnum:]-]*[[:alnum:]])?\.)+` - topdomainPattern = `[[:alnum:]]([[:alnum:]-]*[[:alnum:]])?` -) - -var ( - baseUrlStruct, _ = url.Parse(socBaseUrl) - localPartPattern = fmt.Sprintf(`[%[1]s]+(\.[%[1]s]+)*`, localPartCharClass) - emailRegex = regexp.MustCompile(fmt.Sprintf(`%s@%s%s`, localPartPattern, subdomainPattern, topdomainPattern)) -) - -// ScrapeOrganizations authenticates with SharePoint and exports the student organization directory CSV. 
-func ScrapeOrganizations(outdir string) { - log.Println("Scraping SOC ...") - ctx, cancel := utils.InitChromeDp() - defer cancel() - - if err := loginToSoc(ctx); err != nil { - panic(err) - } - if err := scrapeData(ctx, outdir); err != nil { - panic(err) - } -} - -func loginToSoc(ctx context.Context) error { - log.Println("Logging into SOC ...") - netID, err := utils.GetEnv("LOGIN_NETID") - if err != nil { - return err - } - password, err := utils.GetEnv("LOGIN_PASSWORD") - if err != nil { - return err - } - - _, err = chromedp.RunResponse(ctx, - network.ClearBrowserCookies(), - chromedp.Navigate(socLoginUrl), - chromedp.SendKeys(`input[type="email"]`, netID+"@utdallas.edu"), - chromedp.Click(`input[type="submit"]`), - chromedp.SendKeys(`input[type="password"]`, password), - // wait for sign in button to load (regular WaitVisible and WaitReady methods do not work) - chromedp.Sleep(1*time.Second), - chromedp.Click(`input[type="submit"]`), - chromedp.Sleep(2*time.Second), - chromedp.Click(`button.auth-button`), - chromedp.WaitReady(`body`), - ) - - return err -} - -func scrapeData(ctx context.Context, outdir string) error { - log.Println("Scraping data ...") - // download file method adapted from https://github.com/chromedp/examples/blob/master/download_file/main.go - timedCtx, cancel := context.WithTimeout(ctx, time.Minute) - defer cancel() - - done := make(chan string, 1) - // listen for download events - chromedp.ListenTarget(timedCtx, func(v interface{}) { - ev, ok := v.(*browser.EventDownloadProgress) - if !ok { - return - } - if ev.State == browser.DownloadProgressStateCompleted { - // stop listening for further download events and send guid - cancel() - done <- ev.GUID - close(done) - } - }) - - tempDir, _ := filepath.Abs(filepath.Join(outdir, "tmp")) - utils.VPrintf("Downloading CSV to %s ...", tempDir) - if err := os.MkdirAll(tempDir, 0755); err != nil { - return err - } - if err := chromedp.Run(ctx, - chromedp.Sleep(1*time.Second), - chromedp.Click(`button[name="Export"]`, chromedp.NodeReady), - browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(tempDir).WithEventsEnabled(true), - chromedp.Sleep(1*time.Second), - chromedp.Click(`button[name="Export to CSV"]`, chromedp.NodeReady), - ); err != nil { - return err - } - - // get GUID of download and reconstruct path - guid := <-done - guidPath := filepath.Join(tempDir, guid) - defer func() { - // remove temp file and directory - os.Remove(guidPath) - }() - - outPath := filepath.Join(outdir, "organizations.json") - - if err := processCsv(ctx, guidPath, outPath); err != nil { - return err - } - - return nil -} - -func processCsv(ctx context.Context, inputPath string, storageFilePath string) error { - // open csv for reading - csvFile, err := os.Open(inputPath) - if err != nil { - return err - } - - // init csv reader - bufReader := bufio.NewReader(csvFile) - // discard headers - if _, _, err := bufReader.ReadLine(); err != nil { - return err - } - csvReader := csv.NewReader(bufReader) - - // write to json - storageFile, err := os.Create(storageFilePath) - if err != nil { - return err - } - encoder := json.NewEncoder(bufio.NewWriter(storageFile)) - encoder.SetIndent("", "\t") - - var orgs []*schema.Organization - // process each row of csv - for i := 1; true; i++ { - entry, err := csvReader.Read() - if err != nil { - if err == io.EOF { - break - } - return err - } - - utils.VPrintf("Processing row %d", i) - org, err := parseCsvRecord(ctx, entry) - if err != nil { - return err - } - - orgs = 
append(orgs, org) - } - - // Write JSON to file - if err = encoder.Encode(orgs); err != nil { - return err - } - - if err := csvFile.Close(); err != nil { - return err - } - - if err := storageFile.Close(); err != nil { - return err - } - - return nil -} - -func parseCsvRecord(ctx context.Context, entry []string) (*schema.Organization, error) { - // initial cleaning - for i, v := range entry { - v = strings.ReplaceAll(v, "\u0026", "") - v = strings.TrimSpace(v) - entry[i] = v - } - - imageData, err := retrieveImage(ctx, entry[5]) - if err != nil { - utils.VPrintf("Error retrieving image for %s: %v", entry[0], err) - } - return &schema.Organization{ - Id: primitive.NewObjectID(), - Title: entry[0], - Categories: parseCategories(entry[1]), - Description: entry[2], - President_name: entry[3], - Emails: parseEmails(entry[4]), - Picture_data: imageData, - }, nil -} - -func parseCategories(cats string) []string { - cats = strings.TrimLeft(cats, "[") - cats = strings.TrimRight(cats, "]") - // strange character appears in csv; need to remove it - cats = strings.ReplaceAll(cats, `"`, "") - // split by comma - catsArray := strings.Split(cats, ",") - // strip whitespace from ends - for j, v := range catsArray { - catsArray[j] = strings.TrimSpace(v) - } - - return catsArray -} - -func parseEmails(emails string) []string { - return emailRegex.FindAllString(emails, -1) -} - -func retrieveImage(ctx context.Context, imageUri string) (string, error) { - if imageUri == "" { - return "", nil - } - - urlStruct, err := url.Parse(imageUri) - if err != nil { - return "", err - } - - requestUrl := baseUrlStruct.ResolveReference(urlStruct).String() - - //log.Printf("loading image %s", requestUrl) - // method adapted from https://github.com/chromedp/examples/blob/master/download_image/main.go - - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - done := make(chan bool) - - // this will be used to capture the request id for matching network events - var requestID network.RequestID - - // listen for network requests and choose desired - chromedp.ListenTarget(ctx, func(v interface{}) { - switch ev := v.(type) { - case *network.EventRequestWillBeSent: - if ev.Request.URL == requestUrl { - requestID = ev.RequestID - } - case *network.EventLoadingFinished: - if ev.RequestID == requestID { - close(done) - } - } - }) - - if err := chromedp.Run(ctx, chromedp.Navigate(requestUrl)); err != nil { - log.Printf("Error navigating to %s: %v", requestUrl, err) - return "", err - } - - // wait for image request to finish - <-done - //log.Printf("Done retrieving image from %s", requestUrl) - - var buf []byte - if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error { - var err error - buf, err = network.GetResponseBody(requestID).Do(ctx) - if err != nil { - log.Printf("Error getting response body for %s: %v", requestUrl, err) - } - return err - })); err != nil { - return "", err - } - - encoded := base64.StdEncoding.EncodeToString(buf) - // get response body - return encoded, nil -} diff --git a/uploader/pipelines/trends.go b/uploader/pipelines/trends.go new file mode 100644 index 0000000..caa033c --- /dev/null +++ b/uploader/pipelines/trends.go @@ -0,0 +1,149 @@ +package pipelines + +import ( + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// TrendsCourseSectionsPipeline links course documents to their section records for trends-specific aggregation. 
+var TrendsCourseSectionsPipeline = mongo.Pipeline{ + bson.D{ + {Key: "$lookup", + Value: bson.D{ + {Key: "from", Value: "sections"}, + {Key: "localField", Value: "sections"}, + {Key: "foreignField", Value: "_id"}, + {Key: "as", Value: "sections"}, + }, + }, + }, + bson.D{ + {Key: "$project", + Value: bson.D{ + {Key: "subject_prefix", Value: 1}, + {Key: "course_number", Value: 1}, + {Key: "sections", Value: 1}, + }, + }, + }, + bson.D{ + {Key: "$unwind", + Value: bson.D{ + {Key: "path", Value: "$sections"}, + {Key: "preserveNullAndEmptyArrays", Value: false}, + }, + }, + }, + bson.D{ + {Key: "$group", + Value: bson.D{ + {Key: "_id", + Value: bson.D{ + {Key: "$concat", + Value: bson.A{ + "$subject_prefix", + "$course_number", + }, + }, + }, + }, + {Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}}, + }, + }, + }, +} + +// TrendsProfSectionsPipeline denormalizes professor records with their taught sections for trends-specific aggregation. +var TrendsProfSectionsPipeline = mongo.Pipeline{ + bson.D{ + {Key: "$lookup", + Value: bson.D{ + {Key: "from", Value: "sections"}, + {Key: "localField", Value: "sections"}, + {Key: "foreignField", Value: "_id"}, + {Key: "as", Value: "sections"}, + }, + }, + }, + bson.D{ + {Key: "$project", + Value: bson.D{ + {Key: "first_name", Value: 1}, + {Key: "last_name", Value: 1}, + {Key: "sections", Value: 1}, + }, + }, + }, +} + +// TrendsCourseProfSectionsPipeline links combination of professor and course to the sections for trends-specific aggregation. +var TrendsCourseProfSectionsPipeline = mongo.Pipeline{ + bson.D{ + {Key: "$lookup", + Value: bson.D{ + {Key: "from", Value: "sections"}, + {Key: "localField", Value: "sections"}, + {Key: "foreignField", Value: "_id"}, + {Key: "as", Value: "sections"}, + }, + }, + }, + bson.D{ + {Key: "$project", + Value: bson.D{ + {Key: "subject_prefix", Value: 1}, + {Key: "course_number", Value: 1}, + {Key: "sections", Value: 1}, + }, + }, + }, + + bson.D{ + {Key: "$unwind", + Value: bson.D{ + {Key: "path", Value: "$sections"}, + {Key: "preserveNullAndEmptyArrays", Value: false}, + }, + }, + }, + bson.D{ + {Key: "$lookup", + Value: bson.D{ + {Key: "from", Value: "professors"}, + {Key: "localField", Value: "sections.professors"}, + {Key: "foreignField", Value: "_id"}, + {Key: "as", Value: "professors"}, + }, + }, + }, + bson.D{ + {Key: "$unwind", + Value: bson.D{ + {Key: "path", Value: "$professors"}, + {Key: "preserveNullAndEmptyArrays", Value: false}, + }, + }, + }, + + bson.D{ + {Key: "$group", + Value: bson.D{ + {Key: "_id", + Value: bson.D{ + {Key: "$concat", + Value: bson.A{ + "$subject_prefix", + "$course_number", + " ", + "$professors.first_name", + " ", + "$professors.last_name", + }, + }, + }, + }, + {Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}}, + }, + }, + }, +} diff --git a/uploader/pipelines/trends_course_and_prof_section.go b/uploader/pipelines/trends_course_and_prof_section.go deleted file mode 100644 index fc83589..0000000 --- a/uploader/pipelines/trends_course_and_prof_section.go +++ /dev/null @@ -1,77 +0,0 @@ -package pipelines - -import ( - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -var TrendsCourseProfSectionsPipeline = mongo.Pipeline{ - bson.D{ - {Key: "$lookup", - Value: bson.D{ - {Key: "from", Value: "sections"}, - {Key: "localField", Value: "sections"}, - {Key: "foreignField", Value: "_id"}, - {Key: "as", Value: "sections"}, - }, - }, - }, - bson.D{ - {Key: "$project", - Value: bson.D{ - {Key: "subject_prefix", Value: 1}, 
- {Key: "course_number", Value: 1}, - {Key: "sections", Value: 1}, - }, - }, - }, - - bson.D{ - {Key: "$unwind", - Value: bson.D{ - {Key: "path", Value: "$sections"}, - {Key: "preserveNullAndEmptyArrays", Value: false}, - }, - }, - }, - bson.D{ - {Key: "$lookup", - Value: bson.D{ - {Key: "from", Value: "professors"}, - {Key: "localField", Value: "sections.professors"}, - {Key: "foreignField", Value: "_id"}, - {Key: "as", Value: "professors"}, - }, - }, - }, - bson.D{ - {Key: "$unwind", - Value: bson.D{ - {Key: "path", Value: "$professors"}, - {Key: "preserveNullAndEmptyArrays", Value: false}, - }, - }, - }, - - bson.D{ - {Key: "$group", - Value: bson.D{ - {Key: "_id", - Value: bson.D{ - {Key: "$concat", - Value: bson.A{ - "$subject_prefix", - "$course_number", - " ", - "$professors.first_name", - " ", - "$professors.last_name", - }, - }, - }, - }, - {Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}}, - }, - }, - }, -} diff --git a/uploader/pipelines/trends_course_sections.go b/uploader/pipelines/trends_course_sections.go deleted file mode 100644 index 87fce4d..0000000 --- a/uploader/pipelines/trends_course_sections.go +++ /dev/null @@ -1,54 +0,0 @@ -package pipelines - -import ( - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -// TrendsCourseSectionsPipeline links course documents to their section records for trend reporting. -var TrendsCourseSectionsPipeline = mongo.Pipeline{ - bson.D{ - {Key: "$lookup", - Value: bson.D{ - {Key: "from", Value: "sections"}, - {Key: "localField", Value: "sections"}, - {Key: "foreignField", Value: "_id"}, - {Key: "as", Value: "sections"}, - }, - }, - }, - bson.D{ - {Key: "$project", - Value: bson.D{ - {Key: "subject_prefix", Value: 1}, - {Key: "course_number", Value: 1}, - {Key: "sections", Value: 1}, - }, - }, - }, - bson.D{ - {Key: "$unwind", - Value: bson.D{ - {Key: "path", Value: "$sections"}, - {Key: "preserveNullAndEmptyArrays", Value: false}, - }, - }, - }, - bson.D{ - {Key: "$group", - Value: bson.D{ - {Key: "_id", - Value: bson.D{ - {Key: "$concat", - Value: bson.A{ - "$subject_prefix", - "$course_number", - }, - }, - }, - }, - {Key: "sections", Value: bson.D{{Key: "$addToSet", Value: "$sections"}}}, - }, - }, - }, -} diff --git a/uploader/pipelines/trends_prof_sections.go b/uploader/pipelines/trends_prof_sections.go deleted file mode 100644 index 2961c97..0000000 --- a/uploader/pipelines/trends_prof_sections.go +++ /dev/null @@ -1,29 +0,0 @@ -package pipelines - -import ( - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -// TrendsProfSectionsPipeline denormalizes professor records with their taught sections for analytics. -var TrendsProfSectionsPipeline = mongo.Pipeline{ - bson.D{ - {Key: "$lookup", - Value: bson.D{ - {Key: "from", Value: "sections"}, - {Key: "localField", Value: "sections"}, - {Key: "foreignField", Value: "_id"}, - {Key: "as", Value: "sections"}, - }, - }, - }, - bson.D{ - {Key: "$project", - Value: bson.D{ - {Key: "first_name", Value: 1}, - {Key: "last_name", Value: 1}, - {Key: "sections", Value: 1}, - }, - }, - }, -}
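Usage note: with the three trends pipelines consolidated into uploader/pipelines/trends.go, a consumer only needs the exported pipeline variables. Below is a minimal sketch (not part of this change) of running TrendsCourseSectionsPipeline with the official mongo-driver; the connection URI, the combinedDB/courses database and collection names, and the TrendsCourse result struct are illustrative assumptions, not repository code.

package main

import (
	"context"
	"log"
	"time"

	"github.com/UTDNebula/api-tools/uploader/pipelines"
	"go.mongodb.org/mongo-driver/bson/primitive"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)

// TrendsCourse mirrors the $group output of TrendsCourseSectionsPipeline:
// _id is the concatenated subject prefix and course number, and sections is
// the de-duplicated set of section documents. (Illustrative struct only.)
type TrendsCourse struct {
	Id       string        `bson:"_id"`
	Sections []primitive.M `bson:"sections"`
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Assumed connection details for the sketch.
	client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://localhost:27017"))
	if err != nil {
		log.Panic(err)
	}
	defer client.Disconnect(ctx)

	// Join courses to their sections and group by "<prefix><number>".
	cursor, err := client.Database("combinedDB").Collection("courses").
		Aggregate(ctx, pipelines.TrendsCourseSectionsPipeline)
	if err != nil {
		log.Panic(err)
	}

	var results []TrendsCourse
	if err := cursor.All(ctx, &results); err != nil {
		log.Panic(err)
	}
	log.Printf("aggregated %d course/section groups", len(results))
}

TrendsProfSectionsPipeline and TrendsCourseProfSectionsPipeline would presumably be run the same way against the professors and courses collections, respectively, swapping the result struct to match each pipeline's $project/$group output.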