gitlog.go

  1package flux
  2
  3import (
  4	"bufio"
  5	"context"
  6	"fmt"
  7	"os"
  8	"os/exec"
  9	"path/filepath"
 10	"regexp"
 11	"strings"
 12	"time"
 13)
 14
// GitLogSource scans a git repository for new/modified HTML files.
type GitLogSource struct {
	// RepoPath is the path to the git repository. git is run with this
	// directory as its working directory, and file paths reported by git are
	// resolved against it when reading page titles.
	RepoPath string
	// BaseURL is prepended to a file's repository-relative path to build its
	// URL, e.g. https://vincent.demeester.fr
	BaseURL string

	// Exclude holds filepath.Match patterns. A file is skipped when a pattern
	// matches either its full repository-relative path or its base name.
	Exclude []string
}
 23
 24func (g *GitLogSource) Name() string { return "gitlog" }
 25
 26// Fetch scans git log for HTML file changes since the given time.
 27func (g *GitLogSource) Fetch(ctx context.Context, since time.Time) ([]Entry, error) {
 28	args := []string{"log", "--format=COMMIT %H %aI", "--name-status", "--diff-filter=AM", "--", "*.html"}
 29	if !since.IsZero() {
 30		args = append(args[:2], append([]string{"--since=" + since.Format("2006-01-02")}, args[2:]...)...)
 31	}
 32
 33	cmd := exec.CommandContext(ctx, "git", args...)
 34	cmd.Dir = g.RepoPath
 35
 36	out, err := cmd.Output()
 37	if err != nil {
 38		return nil, fmt.Errorf("git log: %w", err)
 39	}
 40
 41	return g.parseLog(string(out))
 42}
 43
 44// parseLog parses the git log output into entries.
 45// We track files: first appearance = page-new, subsequent = page-updated.
 46// Only the most recent event per file is kept.
 47func (g *GitLogSource) parseLog(output string) ([]Entry, error) {
 48	type fileEvent struct {
 49		file   string
 50		date   time.Time
 51		commit string
 52		status string // A or M
 53	}
 54
 55	var events []fileEvent
 56	var currentDate time.Time
 57	var currentCommit string
 58
 59	scanner := bufio.NewScanner(strings.NewReader(output))
 60	for scanner.Scan() {
 61		line := scanner.Text()
 62		if strings.HasPrefix(line, "COMMIT ") {
 63			parts := strings.SplitN(line, " ", 3)
 64			if len(parts) == 3 {
 65				currentCommit = parts[1]
 66				t, err := time.Parse(time.RFC3339, parts[2])
 67				if err == nil {
 68					currentDate = t
 69				}
 70			}
 71			continue
 72		}
 73
 74		// Tab-separated: status\tfilename
 75		if len(line) >= 3 && (line[0] == 'A' || line[0] == 'M') && line[1] == '\t' {
 76			file := line[2:]
 77			if g.shouldSkip(file) {
 78				continue
 79			}
 80			events = append(events, fileEvent{
 81				file:   file,
 82				date:   currentDate,
 83				commit: currentCommit,
 84				status: string(line[0]),
 85			})
 86		}
 87	}
 88
 89	// Track first seen per file (earliest commit = "added")
 90	firstSeen := make(map[string]string) // file → commit where first added
 91	// Walk in reverse (oldest first) to find true first appearance
 92	for i := len(events) - 1; i >= 0; i-- {
 93		e := events[i]
 94		if _, ok := firstSeen[e.file]; !ok {
 95			firstSeen[e.file] = e.commit
 96		}
 97	}
 98
 99	// Keep only the most recent event per file
100	seen := make(map[string]bool)
101	var entries []Entry
102	for _, ev := range events {
103		if seen[ev.file] {
104			continue
105		}
106		seen[ev.file] = true
107
108		kind := KindPageUpdated
109		if ev.commit == firstSeen[ev.file] || ev.status == "A" {
110			kind = KindPageNew
111		}
112
113		title := g.extractTitle(ev.file)
114		url := g.fileToURL(ev.file)
115		id := fmt.Sprintf("%s-%s", kind, ev.file)
116
117		entries = append(entries, Entry{
118			ID:     id,
119			Kind:   kind,
120			Title:  title,
121			URL:    url,
122			Date:   ev.date,
123			Source: "gitlog",
124			Metadata: map[string]string{
125				"file":   ev.file,
126				"commit": ev.commit[:8],
127			},
128		})
129	}
130
131	return entries, nil
132}
133
134// shouldSkip returns true if the file should be excluded.
135// We skip infrastructure/generated files and keep all content.
136func (g *GitLogSource) shouldSkip(file string) bool {
137	base := filepath.Base(file)
138
139	// Skip generated output directories
140	for _, prefix := range []string{"build/", "site/", ".soupault-cache/", "flux/", "flux-design/"} {
141		if strings.HasPrefix(file, prefix) {
142			return true
143		}
144	}
145
146	// Skip root-level infrastructure files
147	switch file {
148	case "index.html",  // root index (has FLUX_SNIPPET placeholder)
149		"tufte.html",  // CSS demo
150		"css.html",    // CSS demo
151		"custom.css":  // not even HTML
152		return true
153	}
154
155	// Skip sitemap files anywhere
156	if base == "sitemap.html" {
157		return true
158	}
159
160	// Skip embedded data/archive files (org-mode link captures)
161	if strings.Contains(file, "/data/") {
162		return true
163	}
164
165	// Skip fancyindex assets
166	if strings.Contains(file, ".fancyindex/") {
167		return true
168	}
169
170	// User-configured excludes
171	for _, pattern := range g.Exclude {
172		if matched, _ := filepath.Match(pattern, file); matched {
173			return true
174		}
175		if matched, _ := filepath.Match(pattern, base); matched {
176			return true
177		}
178	}
179
180	return false
181}
182
// titleRe captures the text of an HTML <title> element. Matching is
// case-sensitive, attributes on the opening tag are allowed, and the captured
// text must be non-empty and must not contain '<'.
var titleRe = regexp.MustCompile(`<title[^>]*>([^<]+)</title>`)
184
185// extractTitle reads the HTML file and extracts the <title> content.
186func (g *GitLogSource) extractTitle(file string) string {
187	path := filepath.Join(g.RepoPath, file)
188	data, err := os.ReadFile(path)
189	if err != nil {
190		// File might have been deleted; derive from filename
191		return titleFromFilename(file)
192	}
193
194	matches := titleRe.FindSubmatch(data)
195	if matches != nil {
196		title := strings.TrimSpace(string(matches[1]))
197		// Strip common suffixes
198		for _, suffix := range []string{" — Vincent Demeester", " - Vincent Demeester"} {
199			title = strings.TrimSuffix(title, suffix)
200		}
201		if title != "" {
202			return title
203		}
204	}
205
206	return titleFromFilename(file)
207}
208
// titleFromFilename derives a human title from a file path.
//
// It takes the last path element, removes a trailing ".html" and a leading
// date prefix such as "2020-04-15-", replaces '-' and '_' with spaces, and
// upper-cases the first letter.
func titleFromFilename(file string) string {
	base := strings.TrimSuffix(filepath.Base(file), ".html")
	if hasDatePrefix(base) {
		base = base[len("0000-00-00-"):]
	}
	base = strings.NewReplacer("-", " ", "_", " ").Replace(base)

	// Upper-case the whole first rune rather than the first byte, so that a
	// multi-byte leading character (e.g. "é") is not split into invalid UTF-8.
	// Ranging over a string yields rune start offsets; the second one is the
	// byte width of the first rune.
	firstLen := len(base)
	for i := range base {
		if i > 0 {
			firstLen = i
			break
		}
	}
	return strings.ToUpper(base[:firstLen]) + base[firstLen:]
}

// hasDatePrefix reports whether s starts with "DDDD-DD-DD-" (D being an ASCII
// digit) followed by at least one more character. Checking the digits, not
// just the dash positions, keeps names like "code-in-go-lang" intact.
func hasDatePrefix(s string) bool {
	const shape = "0000-00-00-"
	if len(s) <= len(shape) {
		return false
	}
	for i := 0; i < len(shape); i++ {
		if shape[i] == '-' {
			if s[i] != '-' {
				return false
			}
			continue
		}
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}
225
226// fileToURL converts a file path to a URL.
227func (g *GitLogSource) fileToURL(file string) string {
228	return g.BaseURL + "/" + file
229}