···39394040 // Repository is the globally unique name of the repo of the
4141 // match
4242- Repository string
4343- Branches []string
4444- LineMatches []LineMatch
4242+ Repository string
4343+ Branches []string
4444+4545+ // One of LineMatches or ChunkMatches will be returned depending on whether
4646+ // the SearchOptions.ChunkMatches is set.
4747+ LineMatches []LineMatch
4848+ ChunkMatches []ChunkMatch
45494650 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
4751 // Sourcegraph.
···70747175 // Commit SHA1 (hex) of the (sub)repo holding the file.
7276 Version string
7777+}
7878+7979+// ChunkMatch is a set of non-overlapping matches within a contiguous range of
8080+// lines in the file.
8181+type ChunkMatch struct {
8282+ // Content is a contiguous range of complete lines that fully contains Ranges.
8383+ Content []byte
8484+ // ContentStart is the location (inclusive) of the beginning of content
8585+ // relative to the beginning of the file. It will always be at the
8686+ // beginning of a line (Column will always be 1).
8787+ ContentStart Location
8888+8989+ // FileName indicates whether this match is a match on the file name, in
9090+ // which case Content will contain the file name.
9191+ FileName bool
9292+9393+ // Ranges is a set of matching ranges within this chunk. Each range is relative
9494+ // to the beginning of the file (not the beginning of Content).
9595+ Ranges []Range
9696+9797+ // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
9898+ // its length will equal that of Ranges. Any of its elements may be nil.
9999+ SymbolInfo []*Symbol
100100+101101+ Score float64
102102+ DebugScore string
103103+}
104104+105105+type Range struct {
106106+ // The inclusive beginning of the range.
107107+ Start Location
108108+ // The exclusive end of the range.
109109+ End Location
110110+}
// Location identifies a single position in a file. The same position is
// expressed both as a byte offset and as a line/column pair.
type Location struct {
	// ByteOffset is the 0-based byte offset from the beginning of the file.
	ByteOffset uint32
	// LineNumber is the 1-based line number from the beginning of the file.
	LineNumber uint32
	// Column is the 1-based column number (in runes) from the beginning of
	// the line.
	Column uint32
}
7412075121// LineMatch holds the matches within a single line in a file.
···544590 // Note that the included context lines might contain matches and
545591 // it's up to the consumer of the result to remove those lines.
546592 NumContextLines int
593593+594594+ // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
595595+ // EXPERIMENTAL: the behavior of this flag may be changed in future versions.
596596+ ChunkMatches bool
547597548598 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as
549599 // a command-line flag
+234
contentprovider.go
···163163 return result
164164}
165165166166+func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
167167+ var result []ChunkMatch
168168+ if ms[0].fileName {
169169+ // If the first match is a filename match, there will only be
170170+ // one match and the matched content will be the filename.
171171+172172+ fileName := p.id.fileName(p.idx)
173173+ ranges := make([]Range, 0, len(ms))
174174+ for _, m := range ms {
175175+ ranges = append(ranges, Range{
176176+ Start: Location{
177177+ ByteOffset: m.byteOffset,
178178+ LineNumber: 1,
179179+ Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
180180+ },
181181+ End: Location{
182182+ ByteOffset: m.byteOffset + m.byteMatchSz,
183183+ LineNumber: 1,
184184+ Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
185185+ },
186186+ })
187187+ }
188188+189189+ result = []ChunkMatch{{
190190+ Content: fileName,
191191+ ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
192192+ Ranges: ranges,
193193+ FileName: true,
194194+ }}
195195+ } else {
196196+ result = p.fillContentChunkMatches(ms, numContextLines)
197197+ }
198198+199199+ sects := p.docSections()
200200+ for i, m := range result {
201201+ result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
202202+ }
203203+204204+ return result
205205+}
206206+166207func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
167208 var result []LineMatch
168209 for len(ms) > 0 {
···241282 return result
242283}
243284285285+func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
286286+ newlines := p.newlines()
287287+ chunks := chunkCandidates(ms, newlines, numContextLines)
288288+ data := p.data(false)
289289+ chunkMatches := make([]ChunkMatch, 0, len(chunks))
290290+ for _, chunk := range chunks {
291291+ ranges := make([]Range, 0, len(chunk.candidates))
292292+ var symbolInfo []*Symbol
293293+ for i, cm := range chunk.candidates {
294294+ startOffset := cm.byteOffset
295295+ endOffset := cm.byteOffset + cm.byteMatchSz
296296+ startLine, startLineOffset, _ := newlines.atOffset(startOffset)
297297+ endLine, endLineOffset, _ := newlines.atOffset(endOffset)
298298+299299+ ranges = append(ranges, Range{
300300+ Start: Location{
301301+ ByteOffset: startOffset,
302302+ LineNumber: uint32(startLine),
303303+ Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
304304+ },
305305+ End: Location{
306306+ ByteOffset: endOffset,
307307+ LineNumber: uint32(endLine),
308308+ Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
309309+ },
310310+ })
311311+312312+ if cm.symbol {
313313+ if symbolInfo == nil {
314314+ symbolInfo = make([]*Symbol, len(chunk.candidates))
315315+ }
316316+ start := p.id.fileEndSymbol[p.idx]
317317+ si := p.id.symbols.data(start + cm.symbolIdx)
318318+ if si != nil {
319319+ sec := p.docSections()[cm.symbolIdx]
320320+ si.Sym = string(data[sec.Start:sec.End])
321321+ }
322322+ symbolInfo[i] = si
323323+ }
324324+ }
325325+326326+ firstLineNumber := int(chunk.firstLine) - numContextLines
327327+ if firstLineNumber < 1 {
328328+ firstLineNumber = 1
329329+ }
330330+ firstLineStart, _ := newlines.lineBounds(firstLineNumber)
331331+332332+ chunkMatches = append(chunkMatches, ChunkMatch{
333333+ Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
334334+ ContentStart: Location{
335335+ ByteOffset: firstLineStart,
336336+ LineNumber: uint32(firstLineNumber),
337337+ Column: 1,
338338+ },
339339+ FileName: false,
340340+ Ranges: ranges,
341341+ SymbolInfo: symbolInfo,
342342+ })
343343+ }
344344+ return chunkMatches
345345+}
346346+347347+type candidateChunk struct {
348348+ firstLine uint32 // 1-based, inclusive
349349+ lastLine uint32 // 1-based, inclusive
350350+ minOffset uint32 // 0-based, inclusive
351351+ maxOffset uint32 // 0-based, exclusive
352352+ candidates []*candidateMatch
353353+}
354354+355355+// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
356356+// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
357357+// it to overlap with an adjacent chunk.
358358+func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
359359+ var chunks []candidateChunk
360360+ for _, m := range ms {
361361+ startOffset := m.byteOffset
362362+ endOffset := m.byteOffset + m.byteMatchSz
363363+ firstLine, _, _ := newlines.atOffset(startOffset)
364364+ lastLine, _, _ := newlines.atOffset(endOffset)
365365+366366+ if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
367367+ // If a new chunk created with the current candidateMatch would
368368+ // overlap with the previous chunk, instead add the candidateMatch
369369+ // to the last chunk and extend end of the last chunk.
370370+ last := &chunks[len(chunks)-1]
371371+ last.candidates = append(last.candidates, m)
372372+ if last.maxOffset < endOffset {
373373+ last.lastLine = uint32(lastLine)
374374+ last.maxOffset = uint32(endOffset)
375375+ }
376376+ } else {
377377+ chunks = append(chunks, candidateChunk{
378378+ firstLine: uint32(firstLine),
379379+ lastLine: uint32(lastLine),
380380+ minOffset: startOffset,
381381+ maxOffset: endOffset,
382382+ candidates: []*candidateMatch{m},
383383+ })
384384+ }
385385+ }
386386+ return chunks
387387+}
388388+244389type newlines struct {
245390 // locs is the sorted set of byte offsets of the newlines in the file
246391 locs []uint32
···339484 return 0, false
340485}
341486487487+func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
488488+ type debugScore struct {
489489+ score float64
490490+ what string
491491+ }
492492+493493+ score := &debugScore{}
494494+ maxScore := &debugScore{}
495495+496496+ addScore := func(what string, s float64) {
497497+ if debug {
498498+ score.what += fmt.Sprintf("%s:%f, ", what, s)
499499+ }
500500+ score.score += s
501501+ }
502502+503503+ for i, r := range m.Ranges {
504504+ // calculate the start and end offset relative to the start of the content
505505+ relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
506506+ relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
507507+508508+ startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
509509+ endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
510510+511511+ score.score = 0
512512+ score.what = ""
513513+514514+ if startBoundary && endBoundary {
515515+ addScore("WordMatch", scoreWordMatch)
516516+ } else if startBoundary || endBoundary {
517517+ addScore("PartialWordMatch", scorePartialWordMatch)
518518+ }
519519+520520+ if m.FileName {
521521+ sep := bytes.LastIndexByte(m.Content, '/')
522522+ startMatch := relStartOffset == sep+1
523523+ endMatch := relEndOffset == len(m.Content)
524524+ if startMatch && endMatch {
525525+ addScore("Base", scoreBase)
526526+ } else if startMatch || endMatch {
527527+ addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
528528+ } else if sep < relStartOffset {
529529+ addScore("InnerBase", scorePartialBase)
530530+ }
531531+ } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
532532+ sec := secs[secIdx]
533533+ startMatch := sec.Start == uint32(r.Start.ByteOffset)
534534+ endMatch := sec.End == uint32(r.End.ByteOffset)
535535+ if startMatch && endMatch {
536536+ addScore("Symbol", scoreSymbol)
537537+ } else if startMatch || endMatch {
538538+ addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
539539+ } else {
540540+ addScore("InnerSymbol", scorePartialSymbol)
541541+ }
542542+543543+ var si *Symbol
544544+ if m.SymbolInfo != nil {
545545+ si = m.SymbolInfo[i]
546546+ }
547547+ if si == nil {
548548+ // for non-symbol queries, we need to hydrate in SymbolInfo.
549549+ start := p.id.fileEndSymbol[p.idx]
550550+ si = p.id.symbols.data(start + uint32(secIdx))
551551+ }
552552+ if si != nil {
553553+ addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind))
554554+ }
555555+ }
556556+557557+ if score.score > maxScore.score {
558558+ maxScore.score = score.score
559559+ maxScore.what = score.what
560560+ }
561561+ }
562562+563563+ return maxScore.score, strings.TrimRight(maxScore.what, ", ")
564564+}
565565+342566func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
343567 type debugScore struct {
344568 score float64
···437661func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
438662func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
439663664664+type chunkMatchScoreSlice []ChunkMatch
665665+666666+func (m chunkMatchScoreSlice) Len() int { return len(m) }
667667+func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
668668+func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
669669+440670type fileMatchSlice []FileMatch
441671442672func (m fileMatchSlice) Len() int { return len(m) }
···445675446676func sortMatchesByScore(ms []LineMatch) {
447677 sort.Sort(matchScoreSlice(ms))
678678+}
679679+680680+func sortChunkMatchesByScore(ms []ChunkMatch) {
681681+ sort.Sort(chunkMatchScoreSlice(ms))
448682}
449683450684// Sort a slice of results.
···322322 visitMatches(mt, known, func(mt matchTree) {
323323 atomMatchCount++
324324 })
325325- finalCands := gatherMatches(mt, known)
325325+ shouldMergeMatches := !opts.ChunkMatches
326326+ finalCands := gatherMatches(mt, known, shouldMergeMatches)
326327327328 if len(finalCands) == 0 {
328329 nm := d.fileName(nextDoc)
···338339 byteMatchSz: uint32(len(nm)),
339340 })
340341 }
341341- fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
342342+343343+ if opts.ChunkMatches {
344344+ fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
345345+ } else {
346346+ fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
347347+ }
342348343349 maxFileScore := 0.0
344350 for i := range fileMatch.LineMatches {
···350356 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
351357 }
352358359359+ for i := range fileMatch.ChunkMatches {
360360+ if maxFileScore < fileMatch.ChunkMatches[i].Score {
361361+ maxFileScore = fileMatch.ChunkMatches[i].Score
362362+ }
363363+364364+ // Order by ordering in file.
365365+ fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
366366+ }
367367+353368 // Maintain ordering of input files. This
354369 // strictly dominates the in-file ordering of
355370 // the matches.
···365380 }
366381 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
367382 sortMatchesByScore(fileMatch.LineMatches)
383383+ sortChunkMatchesByScore(fileMatch.ChunkMatches)
368384 if opts.Whole {
369385 fileMatch.Content = cp.data(false)
370386 }
371387388388+ matchedChunkRanges := 0
389389+ for _, cm := range fileMatch.ChunkMatches {
390390+ matchedChunkRanges += len(cm.Ranges)
391391+ }
392392+372393 repoMatchCount += len(fileMatch.LineMatches)
394394+ repoMatchCount += matchedChunkRanges
373395374396 res.Files = append(res.Files, fileMatch)
375397 res.Stats.MatchCount += len(fileMatch.LineMatches)
398398+ res.Stats.MatchCount += matchedChunkRanges
376399 res.Stats.FileCount++
377400 }
378401···420443// filename/content matches: if there are content matches, all
421444// filename matches are trimmed from the result. The matches are
422445// returned in document order and are non-overlapping.
423423-func gatherMatches(mt matchTree, known map[matchTree]bool) []*candidateMatch {
446446+//
447447+// If `merge` is set, overlapping and adjacent matches will be merged
448448+// into a single match. Otherwise, overlapping matches will be removed,
449449+// but adjacent matches will remain.
450450+func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
424451 var cands []*candidateMatch
425452 visitMatches(mt, known, func(mt matchTree) {
426453 if smt, ok := mt.(*substrMatchTree); ok {
···450477 }
451478 cands = res
452479453453- // Merge adjacent candidates. This guarantees that the matches
454454- // are non-overlapping.
455455- sort.Sort((sortByOffsetSlice)(cands))
456456- res = cands[:0]
457457- for i, c := range cands {
458458- if i == 0 {
480480+ if merge {
481481+ // Merge adjacent candidates. This guarantees that the matches
482482+ // are non-overlapping.
483483+ sort.Sort((sortByOffsetSlice)(cands))
484484+ res = cands[:0]
485485+ for i, c := range cands {
486486+ if i == 0 {
487487+ res = append(res, c)
488488+ continue
489489+ }
490490+ last := res[len(res)-1]
491491+ lastEnd := last.byteOffset + last.byteMatchSz
492492+ end := c.byteOffset + c.byteMatchSz
493493+ if lastEnd >= c.byteOffset {
494494+ if end > lastEnd {
495495+ last.byteMatchSz = end - last.byteOffset
496496+ }
497497+ continue
498498+ }
499499+459500 res = append(res, c)
460460- continue
461501 }
462462- last := res[len(res)-1]
463463- lastEnd := last.byteOffset + last.byteMatchSz
464464- end := c.byteOffset + c.byteMatchSz
465465- if lastEnd >= c.byteOffset {
466466- if end > lastEnd {
467467- last.byteMatchSz = end - last.byteOffset
502502+ } else {
503503+ // Remove overlapping candidates. This guarantees that the matches
504504+ // are non-overlapping, but also preserves expected match counts.
505505+ sort.Sort((sortByOffsetSlice)(cands))
506506+ res = cands[:0]
507507+ for i, c := range cands {
508508+ if i == 0 {
509509+ res = append(res, c)
510510+ continue
511511+ }
512512+ last := res[len(res)-1]
513513+ lastEnd := last.byteOffset + last.byteMatchSz
514514+ if lastEnd > c.byteOffset {
515515+ continue
468516 }
469469- continue
517517+518518+ res = append(res, c)
470519 }
471471-472472- res = append(res, c)
473520 }
474521475522 return res