Index and expose in API symbol metadata (#18) · boltless.me/zoekt@11d1372

+9

api.go

··· 77 77 LineFragments []LineFragmentMatch 78 78 } 79 79 80 + type Symbol struct { 81 + Sym string 82 + Kind string 83 + Parent string 84 + ParentKind string 85 + } 86 + 80 87 // LineFragmentMatch a segment of matching text within a line. 81 88 type LineFragmentMatch struct { 82 89 // Offset within the line, in bytes. ··· 87 94 88 95 // Number bytes that match. 89 96 MatchLength int 97 + 98 + SymbolInfo *Symbol 90 99 } 91 100 92 101 // Stats contains interesting numbers on the search

+14 -5

build/ctags.go

··· 154 154 } 155 155 doc.Language = strings.ToLower(es[0].Language) 156 156 157 - symOffsets, err := tagsToSections(doc.Content, es) 157 + symOffsets, symMetaData, err := tagsToSections(doc.Content, es) 158 158 if err != nil { 159 159 return fmt.Errorf("%s: %v", doc.Name, err) 160 160 } 161 161 doc.Symbols = symOffsets 162 + doc.SymbolsMetaData = symMetaData 162 163 } 163 164 164 165 return nil ··· 198 199 } 199 200 200 201 for k, tags := range fileTags { 201 - symOffsets, err := tagsToSections(contents[k], tags) 202 + symOffsets, symMetaData, err := tagsToSections(contents[k], tags) 202 203 if err != nil { 203 204 return fmt.Errorf("%s: %v", k, err) 204 205 } 205 206 todo[pathIndices[k]].Symbols = symOffsets 207 + todo[pathIndices[k]].SymbolsMetaData = symMetaData 206 208 if len(tags) > 0 { 207 209 todo[pathIndices[k]].Language = strings.ToLower(tags[0].Language) 208 210 } ··· 210 212 return nil 211 213 } 212 214 213 - func tagsToSections(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, error) { 215 + func tagsToSections(content []byte, tags []*ctags.Entry) ([]zoekt.DocumentSection, []*zoekt.Symbol, error) { 214 216 nls := newLinesIndices(content) 215 217 nls = append(nls, uint32(len(content))) 216 218 var symOffsets []zoekt.DocumentSection 219 + var symMetaData []*zoekt.Symbol 217 220 var lastEnd uint32 218 221 var lastLine int 219 222 var lastIntraEnd int ··· 224 227 } 225 228 lineIdx := t.Line - 1 226 229 if lineIdx >= len(nls) { 227 - return nil, fmt.Errorf("linenum for entry out of range %v", t) 230 + return nil, nil, fmt.Errorf("linenum for entry out of range %v", t) 228 231 } 229 232 230 233 lineOff := uint32(0) ··· 259 262 Start: start, 260 263 End: endSym, 261 264 }) 265 + symMetaData = append(symMetaData, &zoekt.Symbol{ 266 + Sym: t.Sym, 267 + Kind: t.Kind, 268 + Parent: t.Parent, 269 + ParentKind: t.ParentKind, 270 + }) 262 271 lastEnd = endSym 263 272 lastLine = lineIdx 264 273 lastIntraEnd = intraOff + len(t.Sym) 265 274 } 266 275 267 - return symOffsets, nil 276 + return symOffsets, symMetaData, nil 268 277 } 269 278 270 279 func newLinesIndices(in []byte) []uint32 {

+3 -3

build/ctags_test.go

··· 32 32 Line: 2, 33 33 }} 34 34 35 - secs, err := tagsToSections(c, tags) 35 + secs, _, err := tagsToSections(c, tags) 36 36 if err != nil { 37 37 t.Fatal("tagsToSections", err) 38 38 } ··· 57 57 }, 58 58 } 59 59 60 - got, err := tagsToSections(c, tags) 60 + got, _, err := tagsToSections(c, tags) 61 61 if err != nil { 62 62 t.Fatal("tagsToSections", err) 63 63 } ··· 81 81 Line: 2, 82 82 }} 83 83 84 - secs, err := tagsToSections(c, tags) 84 + secs, _, err := tagsToSections(c, tags) 85 85 if err != nil { 86 86 t.Fatal("tagsToSections", err) 87 87 }

+1

contentprovider.go

··· 218 218 Offset: m.byteOffset, 219 219 LineOffset: int(m.byteOffset) - lineStart, 220 220 MatchLength: int(m.byteMatchSz), 221 + SymbolInfo: m.symbolInfo, 221 222 } 222 223 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 223 224 }

+7 -5

ctags/json.go

··· 180 180 } 181 181 182 182 e := Entry{ 183 - Sym: rep.Name, 184 - Path: rep.Path, 185 - Line: rep.Line, 186 - Kind: rep.Kind, 187 - Language: rep.Language, 183 + Sym: rep.Name, 184 + Path: rep.Path, 185 + Parent: rep.Scope, 186 + ParentKind: rep.ScopeKind, 187 + Line: rep.Line, 188 + Kind: rep.Kind, 189 + Language: rep.Language, 188 190 } 189 191 190 192 es = append(es, &e)

+28 -20

ctags/json_test.go

··· 69 69 }, 70 70 71 71 { 72 - Sym: "BLA", 73 - Path: "io/zoekt/Back.java", 74 - Line: 5, 75 - Kind: "field", 76 - Language: "Java", 72 + Sym: "BLA", 73 + Path: "io/zoekt/Back.java", 74 + Line: 5, 75 + Kind: "field", 76 + Language: "Java", 77 + Parent: "Back", 78 + ParentKind: "class", 77 79 }, 78 80 { 79 - Sym: "member", 80 - Path: "io/zoekt/Back.java", 81 - Line: 6, 82 - Language: "Java", 83 - Kind: "field", 81 + Sym: "member", 82 + Path: "io/zoekt/Back.java", 83 + Line: 6, 84 + Language: "Java", 85 + Kind: "field", 86 + Parent: "Back", 87 + ParentKind: "class", 84 88 }, 85 89 { 86 - Sym: "Back", 87 - Path: "io/zoekt/Back.java", 88 - Language: "Java", 89 - Line: 7, 90 - Kind: "method", 90 + Sym: "Back", 91 + Path: "io/zoekt/Back.java", 92 + Language: "Java", 93 + Line: 7, 94 + Kind: "method", 95 + Parent: "Back", 96 + ParentKind: "class", 91 97 }, 92 98 { 93 - Sym: "method", 94 - Language: "Java", 95 - Path: "io/zoekt/Back.java", 96 - Line: 10, 97 - Kind: "method", 99 + Sym: "method", 100 + Language: "Java", 101 + Path: "io/zoekt/Back.java", 102 + Line: 10, 103 + Kind: "method", 104 + Parent: "Back", 105 + ParentKind: "class", 98 106 }, 99 107 } 100 108

+2 -2

ctags/parse.go

··· 27 27 Kind string 28 28 Language string 29 29 Parent string 30 - ParentType string 30 + ParentKind string 31 31 32 32 FileLimited bool 33 33 } ··· 64 64 for _, p := range []string{"class", "enum"} { 65 65 if strings.HasPrefix(f, p+":") { 66 66 e.Parent = strings.TrimPrefix(f, p+":") 67 - e.ParentType = p 67 + e.ParentKind = p 68 68 continue field 69 69 } 70 70 }

+2 -2

ctags/parse_test.go

··· 32 32 Line: 59, 33 33 Kind: "e", 34 34 Parent: "CommitData.Field", 35 - ParentType: "enum", 35 + ParentKind: "enum", 36 36 FileLimited: true, 37 37 }, 38 38 }, ··· 43 43 Line: 55, 44 44 Kind: "f", 45 45 Parent: "BaseServlet", 46 - ParentType: "class", 46 + ParentKind: "class", 47 47 FileLimited: true, 48 48 }, 49 49 },

+56 -6

indexbuilder.go

··· 152 152 docSections [][]DocumentSection 153 153 runeDocSections []DocumentSection 154 154 155 + symID uint32 156 + symIndex map[string]uint32 157 + symKindID uint32 158 + symKindIndex map[string]uint32 159 + symMetaData []uint32 160 + 161 + fileEndSymbol []uint32 162 + 155 163 checksums []byte 156 164 157 165 branchMasks []uint64 ··· 195 203 b := &IndexBuilder{ 196 204 contentPostings: newPostingsBuilder(), 197 205 namePostings: newPostingsBuilder(), 206 + fileEndSymbol: []uint32{0}, 207 + symIndex: make(map[string]uint32), 208 + symKindIndex: make(map[string]uint32), 198 209 languageMap: map[string]byte{}, 199 210 } 200 211 ··· 258 269 SkipReason string 259 270 260 271 // Document sections for symbols. Offsets should use bytes. 261 - Symbols []DocumentSection 272 + Symbols []DocumentSection 273 + SymbolsMetaData []*Symbol 274 + } 275 + 276 + type symbolSlice struct { 277 + symbols []DocumentSection 278 + metaData []*Symbol 262 279 } 263 280 264 - type docSectionSlice []DocumentSection 281 + func (s symbolSlice) Len() int { return len(s.symbols) } 265 282 266 - func (m docSectionSlice) Len() int { return len(m) } 267 - func (m docSectionSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 268 - func (m docSectionSlice) Less(i, j int) bool { return m[i].Start < m[j].Start } 283 + func (s symbolSlice) Swap(i, j int) { 284 + s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 285 + s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 286 + } 287 + 288 + func (s symbolSlice) Less(i, j int) bool { 289 + return s.symbols[i].Start < s.symbols[j].Start 290 + } 269 291 270 292 // AddFile is a convenience wrapper for Add 271 293 func (b *IndexBuilder) AddFile(name string, content []byte) error { ··· 327 349 328 350 const notIndexedMarker = "NOT-INDEXED: " 329 351 352 + func (b *IndexBuilder) symbolID(sym string) uint32 { 353 + if _, ok := b.symIndex[sym]; !ok { 354 + b.symIndex[sym] = b.symID 355 + b.symID++ 356 + } 357 + return b.symIndex[sym] 358 + } 359 + 360 + func (b *IndexBuilder) symbolKindID(t string) uint32 { 361 + if _, ok := b.symKindIndex[t]; !ok { 362 + b.symKindIndex[t] = b.symKindID 363 + b.symKindID++ 364 + } 365 + return b.symKindIndex[t] 366 + } 367 + 368 + func (b *IndexBuilder) addSymbols(symbols []*Symbol) { 369 + for _, sym := range symbols { 370 + b.symMetaData = append(b.symMetaData, 371 + b.symbolID(sym.Sym), 372 + b.symbolKindID(sym.Kind), 373 + b.symbolID(sym.Parent), 374 + b.symbolKindID(sym.ParentKind)) 375 + } 376 + b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.symMetaData)/4)) 377 + } 378 + 330 379 // Add a file which only occurs in certain branches. 331 380 func (b *IndexBuilder) Add(doc Document) error { 332 381 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) ··· 344 393 } 345 394 } 346 395 347 - sort.Sort(docSectionSlice(doc.Symbols)) 396 + sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 348 397 var last DocumentSection 349 398 for i, s := range doc.Symbols { 350 399 if i > 0 { ··· 357 406 if last.End > uint32(len(doc.Content)) { 358 407 return fmt.Errorf("section goes past end of content") 359 408 } 409 + b.addSymbols(doc.SymbolsMetaData) 360 410 361 411 if doc.SubRepositoryPath != "" { 362 412 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name)

+42

indexdata.go

··· 15 15 package zoekt 16 16 17 17 import ( 18 + "encoding/binary" 18 19 "fmt" 19 20 "hash/crc64" 20 21 "unicode/utf8" ··· 26 27 // in memory to search. Most of the memory is taken up by the ngram => 27 28 // offset index. 28 29 type indexData struct { 30 + symbolData 31 + 29 32 file IndexFile 30 33 31 34 ngrams map[ngram]simpleSection ··· 51 54 fileNameContent []byte 52 55 fileNameIndex []uint32 53 56 fileNameNgrams map[ngram][]uint32 57 + 58 + // fileEndSymbol[i] is the index of the first symbol for document i. 59 + fileEndSymbol []uint32 54 60 55 61 // rune offset=>byte offset mapping, relative to the start of the filename corpus 56 62 fileNameRuneOffsets []uint32 ··· 82 88 languageMap map[byte]string 83 89 84 90 repoListEntry RepoListEntry 91 + } 92 + 93 + type symbolData struct { 94 + // symContent stores Symbol.Sym and Symbol.Parent. 95 + // TODO we don't need to store Symbol.Sym. 96 + symContent []byte 97 + symIndex []uint32 98 + // symKindContent is an enum of sym.Kind and sym.ParentKind 99 + symKindContent []byte 100 + symKindIndex []uint32 101 + //symMetadata is [4]uint32 Sym Kind Parent ParentKind 102 + symMetaData []byte 103 + } 104 + 105 + // data returns the symbol at i 106 + func (d *symbolData) data(i uint32) *Symbol { 107 + size := uint32(4 * 4) // 4 uint32s 108 + offset := i * size 109 + if offset >= uint32(len(d.symMetaData)) { 110 + return nil 111 + } 112 + 113 + metadata := d.symMetaData[offset : offset+size] 114 + sym := &Symbol{} 115 + key := binary.BigEndian.Uint32(metadata) 116 + // TODO keeps these as bytes to avoid copy from mmap region. Only copy to 117 + // string when collecting matches. 118 + sym.Sym = string(d.symContent[d.symIndex[key]:d.symIndex[key+1]]) 119 + key = binary.BigEndian.Uint32(metadata[4:]) 120 + sym.Kind = string(d.symKindContent[d.symKindIndex[key]:d.symKindIndex[key+1]]) 121 + key = binary.BigEndian.Uint32(metadata[8:]) 122 + sym.Parent = string(d.symContent[d.symIndex[key]:d.symIndex[key+1]]) 123 + 124 + key = binary.BigEndian.Uint32(metadata[12:]) 125 + sym.ParentKind = string(d.symKindContent[d.symKindIndex[key]:d.symKindIndex[key+1]]) 126 + return sym 85 127 } 86 128 87 129 func (d *indexData) getChecksum(idx uint32) []byte {

+2

matchiter.go

··· 34 34 runeOffset uint32 35 35 byteOffset uint32 36 36 byteMatchSz uint32 37 + 38 + symbolInfo *Symbol 37 39 } 38 40 39 41 // Matches content against the substring, and populates byteMatchSz on success

+13 -4

matchtree.go

··· 170 170 content := cp.data(false) 171 171 172 172 found := t.found[:0] 173 - for _, sec := range sections { 173 + for i, sec := range sections { 174 174 idx := t.regexp.FindIndex(content[sec.Start:sec.End]) 175 175 if idx == nil { 176 176 continue 177 177 } 178 + 179 + secID := cp.id.fileEndSymbol[cp.idx] + uint32(i) 178 180 cm := &candidateMatch{ 179 181 byteOffset: sec.Start + uint32(idx[0]), 180 182 byteMatchSz: uint32(idx[1] - idx[0]), 183 + symbolInfo: cp.id.symbolData.data(secID), 181 184 } 182 - 183 185 found = append(found, cm) 184 186 } 185 187 t.found = found ··· 194 196 patternSize uint32 195 197 fileEndRunes []uint32 196 198 197 - doc uint32 198 - sections []DocumentSection 199 + doc uint32 200 + sections []DocumentSection 201 + symbolData *symbolData 202 + secID uint32 199 203 } 200 204 201 205 func (t *symbolSubstrMatchTree) prepare(doc uint32) { ··· 209 213 210 214 for len(t.sections) > 0 && t.sections[0].Start < fileStart { 211 215 t.sections = t.sections[1:] 216 + t.secID++ 212 217 } 213 218 219 + // TODO we can use fileEndSymbol to skip 214 220 trimmed := t.current[:0] 215 221 for len(t.sections) > 0 && len(t.current) > 0 { 216 222 start := fileStart + t.current[0].runeOffset 217 223 end := start + t.patternSize 218 224 if start >= t.sections[0].End { 219 225 t.sections = t.sections[1:] 226 + t.secID++ 220 227 continue 221 228 } 222 229 ··· 226 233 } 227 234 228 235 if end <= t.sections[0].End { 236 + t.current[0].symbolInfo = t.symbolData.data(t.secID) 229 237 trimmed = append(trimmed, t.current[0]) 230 238 } 231 239 ··· 740 748 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 741 749 fileEndRunes: d.fileEndRunes, 742 750 sections: d.runeDocSections, 751 + symbolData: &d.symbolData, 743 752 }, nil 744 753 } 745 754

+23

read.go

··· 173 173 d.docSectionsStart = toc.fileSections.data.off 174 174 d.docSectionsIndex = toc.fileSections.relativeIndex() 175 175 176 + d.symIndex = toc.symbolMap.relativeIndex() 177 + d.symKindIndex = toc.symbolKindMap.relativeIndex() 178 + 176 179 d.checksums, err = d.readSectionBlob(toc.contentChecksums) 177 180 if err != nil { 178 181 return nil, err ··· 199 202 } 200 203 } 201 204 205 + d.fileEndSymbol, err = readSectionU32(d.file, toc.fileEndSymbol) 206 + if err != nil { 207 + return nil, err 208 + } 209 + 202 210 d.fileBranchMasks, err = readSectionU64(d.file, toc.branchMasks) 211 + if err != nil { 212 + return nil, err 213 + } 214 + 215 + d.symContent, err = d.readSectionBlob(toc.symbolMap.data) 216 + if err != nil { 217 + return nil, err 218 + } 219 + 220 + d.symKindContent, err = d.readSectionBlob(toc.symbolKindMap.data) 221 + if err != nil { 222 + return nil, err 223 + } 224 + 225 + d.symMetaData, err = d.readSectionBlob(toc.symbolMetaData) 203 226 if err != nil { 204 227 return nil, err 205 228 }

+1

rpc/rpc.go

··· 129 129 gob.Register(&query.RepoSet{}) 130 130 gob.Register(&query.Repo{}) 131 131 gob.Register(&query.Substring{}) 132 + gob.Register(&query.Symbol{}) 132 133 gob.Register(&query.Type{}) 133 134 }) 134 135 }

+11 -1

toc.go

··· 27 27 // 13: content checksums 28 28 // 14: languages 29 29 // 15: rune based symbol sections 30 - const IndexFormatVersion = 15 30 + // 16: store ctags metadata 31 + const IndexFormatVersion = 16 31 32 32 33 // FeatureVersion is increased if a feature is added that requires reindexing data 33 34 // without changing the format version ··· 51 52 fileEndRunes simpleSection 52 53 languages simpleSection 53 54 55 + fileEndSymbol simpleSection 56 + symbolMap compoundSection 57 + symbolKindMap compoundSection 58 + symbolMetaData simpleSection 59 + 54 60 branchMasks simpleSection 55 61 subRepos simpleSection 56 62 ··· 73 79 &t.fileContents, 74 80 &t.fileNames, 75 81 &t.fileSections, 82 + &t.fileEndSymbol, 83 + &t.symbolMap, 84 + &t.symbolKindMap, 85 + &t.symbolMetaData, 76 86 &t.newlines, 77 87 &t.ngramText, 78 88 &t.postings,

+27

write.go

··· 39 39 s.end(w) 40 40 } 41 41 42 + func (s *compoundSection) writeMap(w *writer, m map[string]uint32) { 43 + keys := make([]*searchableString, 0, len(m)) 44 + for k, _ := range m { 45 + keys = append(keys, &searchableString{ 46 + data: []byte(k), 47 + }) 48 + } 49 + sort.Slice(keys, func(i, j int) bool { 50 + return m[string(keys[i].data)] < m[string(keys[j].data)] 51 + }) 52 + s.writeStrings(w, keys) 53 + } 54 + 42 55 func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, 43 56 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection) { 44 57 keys := make(ngramSlice, 0, len(s.postings)) ··· 83 96 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data))) 84 97 } 85 98 toc.newlines.end(w) 99 + 100 + toc.fileEndSymbol.start(w) 101 + for _, m := range b.fileEndSymbol { 102 + w.U32(m) 103 + } 104 + toc.fileEndSymbol.end(w) 105 + 106 + toc.symbolMap.writeMap(w, b.symIndex) 107 + toc.symbolKindMap.writeMap(w, b.symKindIndex) 108 + toc.symbolMetaData.start(w) 109 + for _, m := range b.symMetaData { 110 + w.U32(m) 111 + } 112 + toc.symbolMetaData.end(w) 86 113 87 114 toc.branchMasks.start(w) 88 115 for _, m := range b.branchMasks {

Configure Feed

Configure Feed