main
1package flux
2
3import (
4 "bufio"
5 "context"
6 "fmt"
7 "os"
8 "os/exec"
9 "path/filepath"
10 "regexp"
11 "strings"
12 "time"
13)
14
// GitLogSource is a source that reads a git repository's history to find HTML
// files that were added or modified.
type GitLogSource struct {
	// RepoPath is the path to the git repository. git is run with this
	// directory as its working directory, and page titles are read from
	// files beneath it.
	RepoPath string

	// BaseURL is the site root that file paths are appended to,
	// e.g. https://vincent.demeester.fr
	BaseURL string

	// Exclude holds glob patterns. A file is skipped when any pattern
	// matches either its full path or its base name.
	Exclude []string
}
23
24func (g *GitLogSource) Name() string { return "gitlog" }
25
26// Fetch scans git log for HTML file changes since the given time.
27func (g *GitLogSource) Fetch(ctx context.Context, since time.Time) ([]Entry, error) {
28 args := []string{"log", "--format=COMMIT %H %aI", "--name-status", "--diff-filter=AM", "--", "*.html"}
29 if !since.IsZero() {
30 args = append(args[:2], append([]string{"--since=" + since.Format("2006-01-02")}, args[2:]...)...)
31 }
32
33 cmd := exec.CommandContext(ctx, "git", args...)
34 cmd.Dir = g.RepoPath
35
36 out, err := cmd.Output()
37 if err != nil {
38 return nil, fmt.Errorf("git log: %w", err)
39 }
40
41 return g.parseLog(string(out))
42}
43
44// parseLog parses the git log output into entries.
45// We track files: first appearance = page-new, subsequent = page-updated.
46// Only the most recent event per file is kept.
47func (g *GitLogSource) parseLog(output string) ([]Entry, error) {
48 type fileEvent struct {
49 file string
50 date time.Time
51 commit string
52 status string // A or M
53 }
54
55 var events []fileEvent
56 var currentDate time.Time
57 var currentCommit string
58
59 scanner := bufio.NewScanner(strings.NewReader(output))
60 for scanner.Scan() {
61 line := scanner.Text()
62 if strings.HasPrefix(line, "COMMIT ") {
63 parts := strings.SplitN(line, " ", 3)
64 if len(parts) == 3 {
65 currentCommit = parts[1]
66 t, err := time.Parse(time.RFC3339, parts[2])
67 if err == nil {
68 currentDate = t
69 }
70 }
71 continue
72 }
73
74 // Tab-separated: status\tfilename
75 if len(line) >= 3 && (line[0] == 'A' || line[0] == 'M') && line[1] == '\t' {
76 file := line[2:]
77 if g.shouldSkip(file) {
78 continue
79 }
80 events = append(events, fileEvent{
81 file: file,
82 date: currentDate,
83 commit: currentCommit,
84 status: string(line[0]),
85 })
86 }
87 }
88
89 // Track first seen per file (earliest commit = "added")
90 firstSeen := make(map[string]string) // file → commit where first added
91 // Walk in reverse (oldest first) to find true first appearance
92 for i := len(events) - 1; i >= 0; i-- {
93 e := events[i]
94 if _, ok := firstSeen[e.file]; !ok {
95 firstSeen[e.file] = e.commit
96 }
97 }
98
99 // Keep only the most recent event per file
100 seen := make(map[string]bool)
101 var entries []Entry
102 for _, ev := range events {
103 if seen[ev.file] {
104 continue
105 }
106 seen[ev.file] = true
107
108 kind := KindPageUpdated
109 if ev.commit == firstSeen[ev.file] || ev.status == "A" {
110 kind = KindPageNew
111 }
112
113 title := g.extractTitle(ev.file)
114 url := g.fileToURL(ev.file)
115 id := fmt.Sprintf("%s-%s", kind, ev.file)
116
117 entries = append(entries, Entry{
118 ID: id,
119 Kind: kind,
120 Title: title,
121 URL: url,
122 Date: ev.date,
123 Source: "gitlog",
124 Metadata: map[string]string{
125 "file": ev.file,
126 "commit": ev.commit[:8],
127 },
128 })
129 }
130
131 return entries, nil
132}
133
134// shouldSkip returns true if the file should be excluded.
135// We skip infrastructure/generated files and keep all content.
136func (g *GitLogSource) shouldSkip(file string) bool {
137 base := filepath.Base(file)
138
139 // Skip generated output directories
140 for _, prefix := range []string{"build/", "site/", ".soupault-cache/", "flux/", "flux-design/"} {
141 if strings.HasPrefix(file, prefix) {
142 return true
143 }
144 }
145
146 // Skip root-level infrastructure files
147 switch file {
148 case "index.html", // root index (has FLUX_SNIPPET placeholder)
149 "tufte.html", // CSS demo
150 "css.html", // CSS demo
151 "custom.css": // not even HTML
152 return true
153 }
154
155 // Skip sitemap files anywhere
156 if base == "sitemap.html" {
157 return true
158 }
159
160 // Skip embedded data/archive files (org-mode link captures)
161 if strings.Contains(file, "/data/") {
162 return true
163 }
164
165 // Skip fancyindex assets
166 if strings.Contains(file, ".fancyindex/") {
167 return true
168 }
169
170 // User-configured excludes
171 for _, pattern := range g.Exclude {
172 if matched, _ := filepath.Match(pattern, file); matched {
173 return true
174 }
175 if matched, _ := filepath.Match(pattern, base); matched {
176 return true
177 }
178 }
179
180 return false
181}
182
183var titleRe = regexp.MustCompile(`<title[^>]*>([^<]+)</title>`)
184
185// extractTitle reads the HTML file and extracts the <title> content.
186func (g *GitLogSource) extractTitle(file string) string {
187 path := filepath.Join(g.RepoPath, file)
188 data, err := os.ReadFile(path)
189 if err != nil {
190 // File might have been deleted; derive from filename
191 return titleFromFilename(file)
192 }
193
194 matches := titleRe.FindSubmatch(data)
195 if matches != nil {
196 title := strings.TrimSpace(string(matches[1]))
197 // Strip common suffixes
198 for _, suffix := range []string{" — Vincent Demeester", " - Vincent Demeester"} {
199 title = strings.TrimSuffix(title, suffix)
200 }
201 if title != "" {
202 return title
203 }
204 }
205
206 return titleFromFilename(file)
207}
208
// titleFromFilename derives a human-readable title from a file path.
//
// It takes the base name, drops a ".html" extension and a leading
// "YYYY-MM-DD-" date (eight ASCII digits in that exact shape), turns '-' and
// '_' into spaces, and upper-cases the first character.
func titleFromFilename(file string) string {
	name := strings.TrimSuffix(filepath.Base(file), ".html")

	// Remove a date prefix like 2020-04-15-. Digits are checked as well as
	// dashes so that slugs such as "blog-my-so-called-life" stay intact.
	const prefixLen = len("2006-01-02-")
	if len(name) > prefixLen {
		dated := true
		for i := 0; i < prefixLen && dated; i++ {
			switch i {
			case 4, 7, 10:
				dated = name[i] == '-'
			default:
				dated = name[i] >= '0' && name[i] <= '9'
			}
		}
		if dated {
			name = name[prefixLen:]
		}
	}

	name = strings.NewReplacer("-", " ", "_", " ").Replace(name)

	// Upper-case the first rune rather than the first byte: ranging over a
	// string yields rune start offsets, so the second offset is the width of
	// the first rune. Slicing a single byte would split multi-byte characters.
	firstLen := len(name)
	for i := range name {
		if i > 0 {
			firstLen = i
			break
		}
	}
	return strings.ToUpper(name[:firstLen]) + name[firstLen:]
}
225
226// fileToURL converts a file path to a URL.
227func (g *GitLogSource) fileToURL(file string) string {
228 return g.BaseURL + "/" + file
229}