matchtree.go at 5250e0e52a1bd6c2ffd2c74a91ffe090ba47e896 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 5250e0e52a1bd6c2ffd2c74a91ffe090ba47e896 29 kB View raw
Keegan Carruthers-Smith matchtree: disable word search optimization for symbol search (#571) 3y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we can may find something
  33	// interesting.
  34	nextDoc() uint32
  35
  36	// clears any per-document state of the docIterator, and
  37	// prepares for evaluating the given doc. The argument is
  38	// strictly increasing over time.
  39	prepare(nextDoc uint32)
  40}
  41
  42const (
  43	costConst   = 0
  44	costMemory  = 1
  45	costContent = 2
  46	costRegexp  = 3
  47)
  48
  49const (
  50	costMin = costConst
  51	costMax = costRegexp
  52)
  53
  54// An expression tree coupled with matches. The matchtree has two
  55// functions:
  56//
  57// * it implements boolean combinations (and, or, not)
  58//
  59// * it implements shortcuts, where we skip documents (for example: if
  60// there are no trigram matches, we can be sure there are no substring
  61// matches). The matchtree iterates over the documents as they are
  62// ordered in the shard.
  63//
  64// The general process for a given (shard, query) is:
  65//
  66// - construct matchTree for the query
  67//
  68// - find all different leaf matchTrees (substring, regexp, etc.)
  69//
  70// in a loop:
  71//
  72//   - find next doc to process using nextDoc
  73//
  74//   - evaluate atoms (leaf expressions that match text)
  75//
  76//   - evaluate the tree using matches(), storing the result in map.
  77//
  78//   - if the complete tree returns (matches() == true) for the document,
  79//     collect all text matches by looking at leaf matchTrees
  80type matchTree interface {
  81	docIterator
  82
  83	// returns whether this matches, and if we are sure.
  84	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
  85}
  86
  87// docMatchTree iterates over documents for which predicate(docID) returns true.
  88type docMatchTree struct {
  89	// the number of documents in a shard.
  90	numDocs uint32
  91
  92	predicate func(docID uint32) bool
  93
  94	// provides additional information about the reason why the docMatchTree was
  95	// created.
  96	reason string
  97
  98	// mutable
  99	firstDone bool
 100	docID     uint32
 101}
 102
 103type bruteForceMatchTree struct {
 104	// mutable
 105	firstDone bool
 106	docID     uint32
 107}
 108
 109type andLineMatchTree struct {
 110	andMatchTree
 111}
 112
 113type andMatchTree struct {
 114	children []matchTree
 115}
 116
 117type orMatchTree struct {
 118	children []matchTree
 119}
 120
 121type notMatchTree struct {
 122	child matchTree
 123}
 124
 125// Returns only the filename of child matches.
 126type fileNameMatchTree struct {
 127	child matchTree
 128}
 129
 130// Don't visit this subtree for collecting matches.
 131type noVisitMatchTree struct {
 132	matchTree
 133}
 134
 135type regexpMatchTree struct {
 136	regexp *regexp.Regexp
 137
 138	fileName bool
 139
 140	// mutable
 141	reEvaluated bool
 142	found       []*candidateMatch
 143
 144	// nextDoc, prepare.
 145	bruteForceMatchTree
 146}
 147
 148// \bLITERAL\b
 149type wordMatchTree struct {
 150	word string
 151
 152	fileName bool
 153
 154	// mutable
 155	evaluated bool
 156	found     []*candidateMatch
 157
 158	// nextDoc, prepare.
 159	bruteForceMatchTree
 160}
 161
 162type substrMatchTree struct {
 163	matchIterator
 164
 165	query         *query.Substring
 166	caseSensitive bool
 167	fileName      bool
 168
 169	// mutable
 170	current       []*candidateMatch
 171	contEvaluated bool
 172}
 173
 174type branchQueryMatchTree struct {
 175	fileMasks []uint64
 176	masks     []uint64
 177	repos     []uint16
 178
 179	// mutable
 180	firstDone bool
 181	docID     uint32
 182}
 183
 184func (t *branchQueryMatchTree) branchMask() uint64 {
 185	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 186}
 187
 188type symbolRegexpMatchTree struct {
 189	matchTree
 190	regexp *regexp.Regexp
 191	all    bool // skips regex match if .*
 192
 193	reEvaluated bool
 194	found       []*candidateMatch
 195}
 196
 197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 198	t.reEvaluated = false
 199}
 200
 201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 202	if t.reEvaluated {
 203		return len(t.found) > 0, true
 204	}
 205
 206	if cost < costRegexp {
 207		return false, false
 208	}
 209
 210	sections := cp.docSections()
 211	content := cp.data(false)
 212
 213	found := t.found[:0]
 214	for i, sec := range sections {
 215		var idx []int
 216		if t.all {
 217			idx = []int{0, int(sec.End - sec.Start)}
 218		} else {
 219			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 220			if idx == nil {
 221				continue
 222			}
 223		}
 224
 225		cm := &candidateMatch{
 226			byteOffset:  sec.Start + uint32(idx[0]),
 227			byteMatchSz: uint32(idx[1] - idx[0]),
 228			symbol:      true,
 229			symbolIdx:   uint32(i),
 230		}
 231		found = append(found, cm)
 232	}
 233	t.found = found
 234	t.reEvaluated = true
 235
 236	return len(t.found) > 0, true
 237}
 238
 239type symbolSubstrMatchTree struct {
 240	*substrMatchTree
 241
 242	patternSize   uint32
 243	fileEndRunes  []uint32
 244	fileEndSymbol []uint32
 245
 246	doc      uint32
 247	sections []DocumentSection
 248
 249	secID uint32
 250}
 251
 252func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 253	t.substrMatchTree.prepare(doc)
 254	t.doc = doc
 255
 256	var fileStart uint32
 257	if doc > 0 {
 258		fileStart = t.fileEndRunes[doc-1]
 259	}
 260
 261	var sections []DocumentSection
 262	if len(t.sections) > 0 {
 263		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 264		if most == uint32(len(t.sections)) {
 265			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 266		} else {
 267			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 268				t.secID++
 269			}
 270
 271			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 272			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 273				symbolEnd++
 274			}
 275
 276			sections = t.sections[t.secID:symbolEnd]
 277		}
 278	}
 279
 280	secIdx := 0
 281	trimmed := t.current[:0]
 282	for len(sections) > secIdx && len(t.current) > 0 {
 283		start := fileStart + t.current[0].runeOffset
 284		end := start + t.patternSize
 285		if start >= sections[secIdx].End {
 286			secIdx++
 287			continue
 288		}
 289
 290		if start < sections[secIdx].Start {
 291			t.current = t.current[1:]
 292			continue
 293		}
 294
 295		if end <= sections[secIdx].End {
 296			t.current[0].symbol = true
 297			t.current[0].symbolIdx = uint32(secIdx)
 298			trimmed = append(trimmed, t.current[0])
 299		}
 300
 301		t.current = t.current[1:]
 302	}
 303	t.current = trimmed
 304}
 305
 306// all prepare methods
 307
 308func (t *bruteForceMatchTree) prepare(doc uint32) {
 309	t.docID = doc
 310	t.firstDone = true
 311}
 312
 313func (t *docMatchTree) prepare(doc uint32) {
 314	t.docID = doc
 315	t.firstDone = true
 316}
 317
 318func (t *andMatchTree) prepare(doc uint32) {
 319	for _, c := range t.children {
 320		c.prepare(doc)
 321	}
 322}
 323
 324func (t *regexpMatchTree) prepare(doc uint32) {
 325	t.found = t.found[:0]
 326	t.reEvaluated = false
 327	t.bruteForceMatchTree.prepare(doc)
 328}
 329
 330func (t *wordMatchTree) prepare(doc uint32) {
 331	t.found = t.found[:0]
 332	t.evaluated = false
 333	t.bruteForceMatchTree.prepare(doc)
 334}
 335
 336func (t *orMatchTree) prepare(doc uint32) {
 337	for _, c := range t.children {
 338		c.prepare(doc)
 339	}
 340}
 341
 342func (t *notMatchTree) prepare(doc uint32) {
 343	t.child.prepare(doc)
 344}
 345
 346func (t *fileNameMatchTree) prepare(doc uint32) {
 347	t.child.prepare(doc)
 348}
 349
 350func (t *substrMatchTree) prepare(nextDoc uint32) {
 351	t.matchIterator.prepare(nextDoc)
 352	t.current = t.matchIterator.candidates()
 353	t.contEvaluated = false
 354}
 355
 356func (t *branchQueryMatchTree) prepare(doc uint32) {
 357	t.firstDone = true
 358	t.docID = doc
 359}
 360
 361// nextDoc
 362
 363func (t *docMatchTree) nextDoc() uint32 {
 364	var start uint32
 365	if t.firstDone {
 366		start = t.docID + 1
 367	}
 368	for i := start; i < t.numDocs; i++ {
 369		if t.predicate(i) {
 370			return i
 371		}
 372	}
 373	return maxUInt32
 374}
 375
 376func (t *bruteForceMatchTree) nextDoc() uint32 {
 377	if !t.firstDone {
 378		return 0
 379	}
 380	return t.docID + 1
 381}
 382
 383func (t *andMatchTree) nextDoc() uint32 {
 384	var max uint32
 385	for _, c := range t.children {
 386		m := c.nextDoc()
 387		if m > max {
 388			max = m
 389		}
 390	}
 391	return max
 392}
 393
 394func (t *orMatchTree) nextDoc() uint32 {
 395	min := uint32(maxUInt32)
 396	for _, c := range t.children {
 397		m := c.nextDoc()
 398		if m < min {
 399			min = m
 400		}
 401	}
 402	return min
 403}
 404
 405func (t *notMatchTree) nextDoc() uint32 {
 406	return 0
 407}
 408
 409func (t *fileNameMatchTree) nextDoc() uint32 {
 410	return t.child.nextDoc()
 411}
 412
 413func (t *branchQueryMatchTree) nextDoc() uint32 {
 414	var start uint32
 415	if t.firstDone {
 416		start = t.docID + 1
 417	}
 418
 419	for i := start; i < uint32(len(t.fileMasks)); i++ {
 420		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 421			return i
 422		}
 423	}
 424	return maxUInt32
 425}
 426
 427// all String methods
 428
 429func (t *bruteForceMatchTree) String() string {
 430	return "all"
 431}
 432
 433func (t *docMatchTree) String() string {
 434	return fmt.Sprintf("doc(%s)", t.reason)
 435}
 436
 437func (t *andMatchTree) String() string {
 438	return fmt.Sprintf("and%v", t.children)
 439}
 440
 441func (t *regexpMatchTree) String() string {
 442	f := ""
 443	if t.fileName {
 444		f = "f"
 445	}
 446	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 447}
 448
 449func (t *wordMatchTree) String() string {
 450	f := ""
 451	if t.fileName {
 452		f = "f"
 453	}
 454	return fmt.Sprintf("%sword(%s)", f, t.word)
 455}
 456
 457func (t *orMatchTree) String() string {
 458	return fmt.Sprintf("or%v", t.children)
 459}
 460
 461func (t *notMatchTree) String() string {
 462	return fmt.Sprintf("not(%v)", t.child)
 463}
 464
 465func (t *noVisitMatchTree) String() string {
 466	return fmt.Sprintf("novisit(%v)", t.matchTree)
 467}
 468
 469func (t *fileNameMatchTree) String() string {
 470	return fmt.Sprintf("f(%v)", t.child)
 471}
 472
 473func (t *substrMatchTree) String() string {
 474	f := ""
 475	if t.fileName {
 476		f = "f"
 477	}
 478
 479	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 480}
 481
 482func (t *branchQueryMatchTree) String() string {
 483	return fmt.Sprintf("branch(%x)", t.masks)
 484}
 485
 486func (t *symbolSubstrMatchTree) String() string {
 487	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 488}
 489
 490func (t *symbolRegexpMatchTree) String() string {
 491	return fmt.Sprintf("symbol(%v)", t.matchTree)
 492}
 493
 494// visitMatches visits all atoms in matchTree. Note: This visits
 495// noVisitMatchTree. For collecting matches use visitMatches.
 496func visitMatchTree(t matchTree, f func(matchTree)) {
 497	switch s := t.(type) {
 498	case *andMatchTree:
 499		for _, ch := range s.children {
 500			visitMatchTree(ch, f)
 501		}
 502	case *orMatchTree:
 503		for _, ch := range s.children {
 504			visitMatchTree(ch, f)
 505		}
 506	case *andLineMatchTree:
 507		visitMatchTree(&s.andMatchTree, f)
 508	case *noVisitMatchTree:
 509		visitMatchTree(s.matchTree, f)
 510	case *notMatchTree:
 511		visitMatchTree(s.child, f)
 512	case *fileNameMatchTree:
 513		visitMatchTree(s.child, f)
 514	case *symbolSubstrMatchTree:
 515		visitMatchTree(s.substrMatchTree, f)
 516	case *symbolRegexpMatchTree:
 517		visitMatchTree(s.matchTree, f)
 518	default:
 519		f(t)
 520	}
 521}
 522
 523// visitMatches visits all atoms which can contribute matches. Note: This
 524// skips noVisitMatchTree.
 525func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 526	switch s := t.(type) {
 527	case *andMatchTree:
 528		for _, ch := range s.children {
 529			if known[ch] {
 530				visitMatches(ch, known, f)
 531			}
 532		}
 533	case *andLineMatchTree:
 534		visitMatches(&s.andMatchTree, known, f)
 535	case *orMatchTree:
 536		for _, ch := range s.children {
 537			if known[ch] {
 538				visitMatches(ch, known, f)
 539			}
 540		}
 541	case *symbolSubstrMatchTree:
 542		visitMatches(s.substrMatchTree, known, f)
 543	case *notMatchTree:
 544	case *noVisitMatchTree:
 545		// don't collect into negative trees.
 546	case *fileNameMatchTree:
 547		// We will just gather the filename if we do not visit this tree.
 548	default:
 549		f(s)
 550	}
 551}
 552
 553// all matches() methods.
 554
 555func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 556	return t.predicate(cp.idx), true
 557}
 558
 559func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 560	return true, true
 561}
 562
 563// andLineMatchTree is a performance optimization of andMatchTree. For content
 564// searches we don't want to run the regex engine if there is no line that
 565// contains matches from all terms.
 566func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 567	matches, sure := t.andMatchTree.matches(cp, cost, known)
 568	if !(sure && matches) {
 569		return matches, sure
 570	}
 571
 572	// find child with fewest candidates
 573	min := maxUInt32
 574	fewestChildren := 0
 575	for ix, child := range t.children {
 576		v, ok := child.(*substrMatchTree)
 577		// make sure we are running a content search and that all candidates are a
 578		// substrMatchTree
 579		if !ok || v.fileName {
 580			return matches, sure
 581		}
 582		if len(v.current) < min {
 583			min = len(v.current)
 584			fewestChildren = ix
 585		}
 586	}
 587
 588	type lineRange struct {
 589		start int
 590		end   int
 591	}
 592	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 593	prev := -1
 594	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 595		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 596		if line == prev {
 597			continue
 598		}
 599		prev = line
 600		lines = append(lines, lineRange{byteStart, byteEnd})
 601	}
 602
 603	// children keeps track of the children's candidates we have already seen.
 604	children := make([][]*candidateMatch, 0, len(t.children)-1)
 605	for j, child := range t.children {
 606		if j == fewestChildren {
 607			continue
 608		}
 609		children = append(children, child.(*substrMatchTree).current)
 610	}
 611
 612nextLine:
 613	for i := 0; i < len(lines); i++ {
 614		hits := 1
 615	nextChild:
 616		for j := range children {
 617		nextCandidate:
 618			for len(children[j]) > 0 {
 619				candidate := children[j][0]
 620				bo := int(cp.findOffset(false, candidate.runeOffset))
 621				if bo < lines[i].start {
 622					children[j] = children[j][1:]
 623					continue nextCandidate
 624				}
 625				if bo <= lines[i].end {
 626					hits++
 627					continue nextChild
 628				}
 629				// move the `lines` iterator forward until bo <= line.end
 630				for i < len(lines) && bo > lines[i].end {
 631					i++
 632				}
 633				i--
 634				continue nextLine
 635			}
 636		}
 637		// return early once we found any line that contains matches from all children
 638		if hits == len(t.children) {
 639			return matches, true
 640		}
 641	}
 642	return false, true
 643}
 644
 645func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 646	sure := true
 647
 648	for _, ch := range t.children {
 649		v, ok := evalMatchTree(cp, cost, known, ch)
 650		if ok && !v {
 651			return false, true
 652		}
 653		if !ok {
 654			sure = false
 655		}
 656	}
 657
 658	return true, sure
 659}
 660
 661func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 662	matches := false
 663	sure := true
 664	for _, ch := range t.children {
 665		v, ok := evalMatchTree(cp, cost, known, ch)
 666		if ok {
 667			// we could short-circuit, but we want to use
 668			// the other possibilities as a ranking
 669			// signal.
 670			matches = matches || v
 671		} else {
 672			sure = false
 673		}
 674	}
 675	return matches, sure
 676}
 677
 678func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 679	return t.branchMask() != 0, true
 680}
 681
 682func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 683	if t.reEvaluated {
 684		return len(t.found) > 0, true
 685	}
 686
 687	if cost < costRegexp {
 688		return false, false
 689	}
 690
 691	cp.stats.RegexpsConsidered++
 692	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 693	found := t.found[:0]
 694	for _, idx := range idxs {
 695		cm := &candidateMatch{
 696			byteOffset:  uint32(idx[0]),
 697			byteMatchSz: uint32(idx[1] - idx[0]),
 698			fileName:    t.fileName,
 699		}
 700
 701		found = append(found, cm)
 702	}
 703	t.found = found
 704	t.reEvaluated = true
 705
 706	return len(t.found) > 0, true
 707}
 708
 709func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 710	if t.evaluated {
 711		return len(t.found) > 0, true
 712	}
 713
 714	if cost < costRegexp {
 715		return false, false
 716	}
 717
 718	data := cp.data(t.fileName)
 719	offset := 0
 720	found := t.found[:0]
 721	for {
 722		idx := bytes.Index(data[offset:], []byte(t.word))
 723		if idx < 0 {
 724			break
 725		}
 726
 727		relStartOffset := offset + idx
 728		relEndOffset := relStartOffset + len(t.word)
 729
 730		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 731		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 732		if startBoundary && endBoundary {
 733			found = append(found, &candidateMatch{
 734				byteOffset:  uint32(offset + idx),
 735				byteMatchSz: uint32(len(t.word)),
 736				fileName:    t.fileName,
 737			})
 738		}
 739		offset += idx + len(t.word)
 740	}
 741
 742	t.found = found
 743	t.evaluated = true
 744
 745	return len(t.found) > 0, true
 746}
 747
 748// breakMatchesOnNewlines returns matches resulting from breaking each element
 749// of cms on newlines within text.
 750func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 751	var lineCMs []*candidateMatch
 752	for _, cm := range cms {
 753		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 754	}
 755	return lineCMs
 756}
 757
 758// breakOnNewlines returns matches resulting from breaking cm on newlines
 759// within text.
 760func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 761	var cms []*candidateMatch
 762	addMe := &candidateMatch{}
 763	*addMe = *cm
 764	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 765		if text[i] == '\n' {
 766			addMe.byteMatchSz = i - addMe.byteOffset
 767			if addMe.byteMatchSz != 0 {
 768				cms = append(cms, addMe)
 769			}
 770
 771			addMe = &candidateMatch{}
 772			*addMe = *cm
 773			addMe.byteOffset = i + 1
 774		}
 775	}
 776	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 777	if addMe.byteMatchSz != 0 {
 778		cms = append(cms, addMe)
 779	}
 780	return cms
 781}
 782
 783func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 784	if v, ok := known[mt]; ok {
 785		return v, true
 786	}
 787
 788	v, ok := mt.matches(cp, cost, known)
 789	if ok {
 790		known[mt] = v
 791	}
 792
 793	return v, ok
 794}
 795
 796func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 797	v, ok := evalMatchTree(cp, cost, known, t.child)
 798	return !v, ok
 799}
 800
 801func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 802	return evalMatchTree(cp, cost, known, t.child)
 803}
 804
 805func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 806	if t.contEvaluated {
 807		return len(t.current) > 0, true
 808	}
 809
 810	if len(t.current) == 0 {
 811		return false, true
 812	}
 813
 814	if t.fileName && cost < costMemory {
 815		return false, false
 816	}
 817
 818	if !t.fileName && cost < costContent {
 819		return false, false
 820	}
 821
 822	pruned := t.current[:0]
 823	for _, m := range t.current {
 824		if m.byteOffset == 0 && m.runeOffset > 0 {
 825			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 826		}
 827		if m.matchContent(cp.data(m.fileName)) {
 828			pruned = append(pruned, m)
 829		}
 830	}
 831	t.current = pruned
 832	t.contEvaluated = true
 833
 834	return len(t.current) > 0, true
 835}
 836
 837type matchTreeOpt struct {
 838	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 839	// This was added since we do not support wordMatchTree with symbol search.
 840	DisableWordMatchOptimization bool
 841}
 842
 843func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 844	if q == nil {
 845		return nil, fmt.Errorf("got nil (sub)query")
 846	}
 847	switch s := q.(type) {
 848	case *query.Regexp:
 849		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 850		// superset of the regexp. If the returned matchTree is equivalent to the
 851		// original regexp, it returns true. An equivalent matchTree has the same
 852		// behaviour as the original regexp and can be used instead.
 853		//
 854		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 855		if err != nil {
 856			return nil, err
 857		}
 858		// if the query can be used in place of the regexp
 859		// return the subtree
 860		if isEq {
 861			return subMT, nil
 862		}
 863
 864		var tr matchTree
 865		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 866			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 867			// provide something faster.
 868			tr = wmt
 869		} else {
 870			prefix := ""
 871			if !s.CaseSensitive {
 872				prefix = "(?i)"
 873			}
 874
 875			tr = &regexpMatchTree{
 876				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 877				fileName: s.FileName,
 878			}
 879		}
 880
 881		return &andMatchTree{
 882			children: []matchTree{
 883				tr, &noVisitMatchTree{subMT},
 884			},
 885		}, nil
 886	case *query.And:
 887		var r []matchTree
 888		for _, ch := range s.Children {
 889			ct, err := d.newMatchTree(ch, opt)
 890			if err != nil {
 891				return nil, err
 892			}
 893			r = append(r, ct)
 894		}
 895		return &andMatchTree{r}, nil
 896	case *query.Or:
 897		var r []matchTree
 898		for _, ch := range s.Children {
 899			ct, err := d.newMatchTree(ch, opt)
 900			if err != nil {
 901				return nil, err
 902			}
 903			r = append(r, ct)
 904		}
 905		return &orMatchTree{r}, nil
 906	case *query.Not:
 907		ct, err := d.newMatchTree(s.Child, opt)
 908		return &notMatchTree{
 909			child: ct,
 910		}, err
 911
 912	case *query.Type:
 913		if s.Type != query.TypeFileName {
 914			break
 915		}
 916
 917		ct, err := d.newMatchTree(s.Child, opt)
 918		if err != nil {
 919			return nil, err
 920		}
 921
 922		return &fileNameMatchTree{
 923			child: ct,
 924		}, nil
 925
 926	case *query.Substring:
 927		return d.newSubstringMatchTree(s)
 928
 929	case *query.Branch:
 930		masks := make([]uint64, 0, len(d.repoMetaData))
 931		if s.Pattern == "HEAD" {
 932			for i := 0; i < len(d.repoMetaData); i++ {
 933				masks = append(masks, 1)
 934			}
 935		} else {
 936			for _, branchIDs := range d.branchIDs {
 937				mask := uint64(0)
 938				for nm, m := range branchIDs {
 939					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
 940						mask |= uint64(m)
 941					}
 942				}
 943				masks = append(masks, mask)
 944			}
 945
 946		}
 947		return &branchQueryMatchTree{
 948			masks:     masks,
 949			fileMasks: d.fileBranchMasks,
 950			repos:     d.repos,
 951		}, nil
 952	case *query.Const:
 953		if s.Value {
 954			return &bruteForceMatchTree{}, nil
 955		} else {
 956			return &noMatchTree{"const"}, nil
 957		}
 958	case *query.Language:
 959		code, ok := d.metaData.LanguageMap[s.Language]
 960		if !ok {
 961			return &noMatchTree{"lang"}, nil
 962		}
 963		return &docMatchTree{
 964			reason:  "language",
 965			numDocs: d.numDocs(),
 966			predicate: func(docID uint32) bool {
 967				return d.getLanguage(docID) == code
 968			},
 969		}, nil
 970
 971	case *query.Symbol:
 972		// Disable WordMatchTree since we don't support it in symbols yet.
 973		optCopy := opt
 974		optCopy.DisableWordMatchOptimization = true
 975
 976		subMT, err := d.newMatchTree(s.Expr, optCopy)
 977		if err != nil {
 978			return nil, err
 979		}
 980
 981		if substr, ok := subMT.(*substrMatchTree); ok {
 982			// We have a feature flag for lazy decoding. If runeDocSectionsRaw is
 983			// non-nil it means we need to lazily decode on request.
 984			sections := d.runeDocSections
 985			if sections == nil && d.runeDocSectionsRaw != nil {
 986				sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
 987			}
 988
 989			return &symbolSubstrMatchTree{
 990				substrMatchTree: substr,
 991				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
 992				fileEndRunes:    d.fileEndRunes,
 993				fileEndSymbol:   d.fileEndSymbol,
 994				sections:        sections,
 995			}, nil
 996		}
 997
 998		var regexp *regexp.Regexp
 999		visitMatchTree(subMT, func(mt matchTree) {
1000			if t, ok := mt.(*regexpMatchTree); ok {
1001				regexp = t.regexp
1002			}
1003		})
1004		if regexp == nil {
1005			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1006		}
1007
1008		return &symbolRegexpMatchTree{
1009			regexp:    regexp,
1010			all:       regexp.String() == "(?i)(?-s:.)*",
1011			matchTree: subMT,
1012		}, nil
1013
1014	case *query.FileNameSet:
1015		return &docMatchTree{
1016			reason:  "FileNameSet",
1017			numDocs: d.numDocs(),
1018			predicate: func(docID uint32) bool {
1019				fileName := d.fileName(docID)
1020				_, ok := s.Set[string(fileName)]
1021				return ok
1022			},
1023		}, nil
1024
1025	case *query.BranchesRepos:
1026		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1027		for repoIdx := range d.repoMetaData {
1028			var mask uint64
1029			for _, br := range s.List {
1030				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1031					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1032				}
1033			}
1034			reposBranchesWant[repoIdx] = mask
1035		}
1036		return &docMatchTree{
1037			reason:  "BranchesRepos",
1038			numDocs: d.numDocs(),
1039			predicate: func(docID uint32) bool {
1040				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1041			},
1042		}, nil
1043
1044	case *query.RepoSet:
1045		reposWant := make([]bool, len(d.repoMetaData))
1046		for repoIdx, r := range d.repoMetaData {
1047			if _, ok := s.Set[r.Name]; ok {
1048				reposWant[repoIdx] = true
1049			}
1050		}
1051		return &docMatchTree{
1052			reason:  "RepoSet",
1053			numDocs: d.numDocs(),
1054			predicate: func(docID uint32) bool {
1055				return reposWant[d.repos[docID]]
1056			},
1057		}, nil
1058
1059	case *query.RepoIDs:
1060		reposWant := make([]bool, len(d.repoMetaData))
1061		for repoIdx, r := range d.repoMetaData {
1062			if s.Repos.Contains(r.ID) {
1063				reposWant[repoIdx] = true
1064			}
1065		}
1066		return &docMatchTree{
1067			reason:  "RepoIDs",
1068			numDocs: d.numDocs(),
1069			predicate: func(docID uint32) bool {
1070				return reposWant[d.repos[docID]]
1071			},
1072		}, nil
1073
1074	case *query.Repo:
1075		reposWant := make([]bool, len(d.repoMetaData))
1076		for repoIdx, r := range d.repoMetaData {
1077			if s.Regexp.MatchString(r.Name) {
1078				reposWant[repoIdx] = true
1079			}
1080		}
1081		return &docMatchTree{
1082			reason:  "Repo",
1083			numDocs: d.numDocs(),
1084			predicate: func(docID uint32) bool {
1085				return reposWant[d.repos[docID]]
1086			},
1087		}, nil
1088
1089	case *query.RepoRegexp:
1090		reposWant := make([]bool, len(d.repoMetaData))
1091		for repoIdx, r := range d.repoMetaData {
1092			if s.Regexp.MatchString(r.Name) {
1093				reposWant[repoIdx] = true
1094			}
1095		}
1096		return &docMatchTree{
1097			reason:  "RepoRegexp",
1098			numDocs: d.numDocs(),
1099			predicate: func(docID uint32) bool {
1100				return reposWant[d.repos[docID]]
1101			},
1102		}, nil
1103
1104	case query.RawConfig:
1105		return &docMatchTree{
1106			reason:  s.String(),
1107			numDocs: d.numDocs(),
1108			predicate: func(docID uint32) bool {
1109				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1110			},
1111		}, nil
1112	}
1113	log.Panicf("type %T", q)
1114	return nil, nil
1115}
1116
1117func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1118	st := &substrMatchTree{
1119		query:         s,
1120		caseSensitive: s.CaseSensitive,
1121		fileName:      s.FileName,
1122	}
1123
1124	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1125		prefix := ""
1126		if !s.CaseSensitive {
1127			prefix = "(?i)"
1128		}
1129		t := &regexpMatchTree{
1130			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1131			fileName: s.FileName,
1132		}
1133		return t, nil
1134	}
1135
1136	result, err := d.iterateNgrams(s)
1137	if err != nil {
1138		return nil, err
1139	}
1140	st.matchIterator = result
1141	return st, nil
1142}
1143
1144func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1145	if opt.DisableWordMatchOptimization {
1146		return nil, false
1147	}
1148	// Needs to be case sensitive
1149	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1150		return nil, false
1151	}
1152	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1153	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1154		return nil, false
1155	}
1156	sub := q.Regexp.Sub
1157	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1158		return nil, false
1159	}
1160
1161	return &wordMatchTree{
1162		word:     string(sub[1].Rune),
1163		fileName: q.FileName,
1164	}, true
1165}
1166
1167// pruneMatchTree removes impossible branches from the matchTree, as indicated
1168// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1169func pruneMatchTree(mt matchTree) (matchTree, error) {
1170	var err error
1171	switch mt := mt.(type) {
1172	// leaf nodes that we test with the filter:
1173	case *substrMatchTree:
1174		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1175			if _, ok := res.matchIterator.(*noMatchTree); ok {
1176				return nil, nil
1177			}
1178		}
1179	// recursive tree structures:
1180	case *andMatchTree:
1181		// Any branch of an and becoming impossible means the entire clause
1182		// is impossible. Otherwise, just handle rewrites.
1183		for i, child := range mt.children {
1184			newChild, err := pruneMatchTree(child)
1185			if err != nil {
1186				return nil, err
1187			}
1188			if newChild == nil {
1189				return nil, nil
1190			}
1191			mt.children[i] = newChild
1192		}
1193	case *orMatchTree:
1194		// *All* branches of an OR becoming impossible means the entire clause
1195		// is impossible. Otherwise, drop impossible subclauses and handle
1196		// rewrites, including simplifying to a singular resulting child branch.
1197		n := 0
1198		for _, child := range mt.children {
1199			newChild, err := pruneMatchTree(child)
1200			if err != nil {
1201				return nil, err
1202			}
1203			if newChild != nil {
1204				mt.children[n] = newChild
1205				n++
1206			}
1207		}
1208		mt.children = mt.children[:n]
1209		if len(mt.children) == 1 {
1210			return mt.children[0], nil
1211		} else if len(mt.children) == 0 {
1212			return nil, nil
1213		}
1214	case *noVisitMatchTree:
1215		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1216		if err != nil {
1217			return nil, err
1218		}
1219		if mt.matchTree == nil {
1220			return nil, nil
1221		}
1222	case *fileNameMatchTree:
1223		mt.child, err = pruneMatchTree(mt.child)
1224	case *andLineMatchTree:
1225		child, err := pruneMatchTree(&mt.andMatchTree)
1226		if err != nil {
1227			return nil, err
1228		}
1229		if child == nil {
1230			return nil, nil
1231		}
1232		if c, ok := child.(*andMatchTree); ok {
1233			mt.andMatchTree = *c
1234		} else {
1235			// the and was simplified to a single clause,
1236			// so the linematch portion is irrelevant.
1237			return mt, nil
1238		}
1239	case *notMatchTree:
1240		mt.child, err = pruneMatchTree(mt.child)
1241		if err != nil {
1242			return nil, err
1243		}
1244		if mt.child == nil {
1245			// not false => true
1246			return &bruteForceMatchTree{}, nil
1247		}
1248	// unhandled:
1249	case *docMatchTree:
1250	case *bruteForceMatchTree:
1251	case *regexpMatchTree:
1252	case *wordMatchTree:
1253	}
1254	return mt, err
1255}
Configure Feed

Configure Feed