matchtree.go at dc41c6e3cc84a1c8c830bd7365b71cc2621e6a61 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at dc41c6e3cc84a1c8c830bd7365b71cc2621e6a61 29 kB View raw
Keegan Carruthers-Smith remove ZOEKT_ENABLE_LAZY_DOC_SECTIONS (#620) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we can may find something
  33	// interesting.
  34	nextDoc() uint32
  35
  36	// clears any per-document state of the docIterator, and
  37	// prepares for evaluating the given doc. The argument is
  38	// strictly increasing over time.
  39	prepare(nextDoc uint32)
  40}
  41
  42const (
  43	costConst   = 0
  44	costMemory  = 1
  45	costContent = 2
  46	costRegexp  = 3
  47)
  48
  49const (
  50	costMin = costConst
  51	costMax = costRegexp
  52)
  53
  54// An expression tree coupled with matches. The matchtree has two
  55// functions:
  56//
  57// * it implements boolean combinations (and, or, not)
  58//
  59// * it implements shortcuts, where we skip documents (for example: if
  60// there are no trigram matches, we can be sure there are no substring
  61// matches). The matchtree iterates over the documents as they are
  62// ordered in the shard.
  63//
  64// The general process for a given (shard, query) is:
  65//
  66// - construct matchTree for the query
  67//
  68// - find all different leaf matchTrees (substring, regexp, etc.)
  69//
  70// in a loop:
  71//
  72//   - find next doc to process using nextDoc
  73//
  74//   - evaluate atoms (leaf expressions that match text)
  75//
  76//   - evaluate the tree using matches(), storing the result in map.
  77//
  78//   - if the complete tree returns (matches() == true) for the document,
  79//     collect all text matches by looking at leaf matchTrees
  80type matchTree interface {
  81	docIterator
  82
  83	// returns whether this matches, and if we are sure.
  84	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
  85}
  86
  87// docMatchTree iterates over documents for which predicate(docID) returns true.
  88type docMatchTree struct {
  89	// the number of documents in a shard.
  90	numDocs uint32
  91
  92	predicate func(docID uint32) bool
  93
  94	// provides additional information about the reason why the docMatchTree was
  95	// created.
  96	reason string
  97
  98	// mutable
  99	firstDone bool
 100	docID     uint32
 101}
 102
 103type bruteForceMatchTree struct {
 104	// mutable
 105	firstDone bool
 106	docID     uint32
 107}
 108
 109type andLineMatchTree struct {
 110	andMatchTree
 111}
 112
 113type andMatchTree struct {
 114	children []matchTree
 115}
 116
 117type orMatchTree struct {
 118	children []matchTree
 119}
 120
 121type notMatchTree struct {
 122	child matchTree
 123}
 124
 125// Returns only the filename of child matches.
 126type fileNameMatchTree struct {
 127	child matchTree
 128}
 129
 130// Don't visit this subtree for collecting matches.
 131type noVisitMatchTree struct {
 132	matchTree
 133}
 134
 135type regexpMatchTree struct {
 136	regexp *regexp.Regexp
 137
 138	fileName bool
 139
 140	// mutable
 141	reEvaluated bool
 142	found       []*candidateMatch
 143
 144	// nextDoc, prepare.
 145	bruteForceMatchTree
 146}
 147
 148// \bLITERAL\b
 149type wordMatchTree struct {
 150	word string
 151
 152	fileName bool
 153
 154	// mutable
 155	evaluated bool
 156	found     []*candidateMatch
 157
 158	// nextDoc, prepare.
 159	bruteForceMatchTree
 160}
 161
 162type substrMatchTree struct {
 163	matchIterator
 164
 165	query         *query.Substring
 166	caseSensitive bool
 167	fileName      bool
 168
 169	// mutable
 170	current       []*candidateMatch
 171	contEvaluated bool
 172}
 173
 174type branchQueryMatchTree struct {
 175	fileMasks []uint64
 176	masks     []uint64
 177	repos     []uint16
 178
 179	// mutable
 180	firstDone bool
 181	docID     uint32
 182}
 183
 184func (t *branchQueryMatchTree) branchMask() uint64 {
 185	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 186}
 187
 188type symbolRegexpMatchTree struct {
 189	matchTree
 190	regexp *regexp.Regexp
 191	all    bool // skips regex match if .*
 192
 193	reEvaluated bool
 194	found       []*candidateMatch
 195}
 196
 197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 198	t.reEvaluated = false
 199}
 200
 201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 202	if t.reEvaluated {
 203		return len(t.found) > 0, true
 204	}
 205
 206	if cost < costRegexp {
 207		return false, false
 208	}
 209
 210	sections := cp.docSections()
 211	content := cp.data(false)
 212
 213	found := t.found[:0]
 214	for i, sec := range sections {
 215		var idx []int
 216		if t.all {
 217			idx = []int{0, int(sec.End - sec.Start)}
 218		} else {
 219			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 220			if idx == nil {
 221				continue
 222			}
 223		}
 224
 225		cm := &candidateMatch{
 226			byteOffset:  sec.Start + uint32(idx[0]),
 227			byteMatchSz: uint32(idx[1] - idx[0]),
 228			symbol:      true,
 229			symbolIdx:   uint32(i),
 230		}
 231		found = append(found, cm)
 232	}
 233	t.found = found
 234	t.reEvaluated = true
 235
 236	return len(t.found) > 0, true
 237}
 238
 239type symbolSubstrMatchTree struct {
 240	*substrMatchTree
 241
 242	patternSize   uint32
 243	fileEndRunes  []uint32
 244	fileEndSymbol []uint32
 245
 246	doc      uint32
 247	sections []DocumentSection
 248
 249	secID uint32
 250}
 251
 252func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 253	t.substrMatchTree.prepare(doc)
 254	t.doc = doc
 255
 256	var fileStart uint32
 257	if doc > 0 {
 258		fileStart = t.fileEndRunes[doc-1]
 259	}
 260
 261	var sections []DocumentSection
 262	if len(t.sections) > 0 {
 263		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 264		if most == uint32(len(t.sections)) {
 265			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 266		} else {
 267			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 268				t.secID++
 269			}
 270
 271			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 272			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 273				symbolEnd++
 274			}
 275
 276			sections = t.sections[t.secID:symbolEnd]
 277		}
 278	}
 279
 280	secIdx := 0
 281	trimmed := t.current[:0]
 282	for len(sections) > secIdx && len(t.current) > 0 {
 283		start := fileStart + t.current[0].runeOffset
 284		end := start + t.patternSize
 285		if start >= sections[secIdx].End {
 286			secIdx++
 287			continue
 288		}
 289
 290		if start < sections[secIdx].Start {
 291			t.current = t.current[1:]
 292			continue
 293		}
 294
 295		if end <= sections[secIdx].End {
 296			t.current[0].symbol = true
 297			t.current[0].symbolIdx = uint32(secIdx)
 298			trimmed = append(trimmed, t.current[0])
 299		}
 300
 301		t.current = t.current[1:]
 302	}
 303	t.current = trimmed
 304}
 305
 306// all prepare methods
 307
 308func (t *bruteForceMatchTree) prepare(doc uint32) {
 309	t.docID = doc
 310	t.firstDone = true
 311}
 312
 313func (t *docMatchTree) prepare(doc uint32) {
 314	t.docID = doc
 315	t.firstDone = true
 316}
 317
 318func (t *andMatchTree) prepare(doc uint32) {
 319	for _, c := range t.children {
 320		c.prepare(doc)
 321	}
 322}
 323
 324func (t *regexpMatchTree) prepare(doc uint32) {
 325	t.found = t.found[:0]
 326	t.reEvaluated = false
 327	t.bruteForceMatchTree.prepare(doc)
 328}
 329
 330func (t *wordMatchTree) prepare(doc uint32) {
 331	t.found = t.found[:0]
 332	t.evaluated = false
 333	t.bruteForceMatchTree.prepare(doc)
 334}
 335
 336func (t *orMatchTree) prepare(doc uint32) {
 337	for _, c := range t.children {
 338		c.prepare(doc)
 339	}
 340}
 341
 342func (t *notMatchTree) prepare(doc uint32) {
 343	t.child.prepare(doc)
 344}
 345
 346func (t *fileNameMatchTree) prepare(doc uint32) {
 347	t.child.prepare(doc)
 348}
 349
 350func (t *substrMatchTree) prepare(nextDoc uint32) {
 351	t.matchIterator.prepare(nextDoc)
 352	t.current = t.matchIterator.candidates()
 353	t.contEvaluated = false
 354}
 355
 356func (t *branchQueryMatchTree) prepare(doc uint32) {
 357	t.firstDone = true
 358	t.docID = doc
 359}
 360
 361// nextDoc
 362
 363func (t *docMatchTree) nextDoc() uint32 {
 364	var start uint32
 365	if t.firstDone {
 366		start = t.docID + 1
 367	}
 368	for i := start; i < t.numDocs; i++ {
 369		if t.predicate(i) {
 370			return i
 371		}
 372	}
 373	return maxUInt32
 374}
 375
 376func (t *bruteForceMatchTree) nextDoc() uint32 {
 377	if !t.firstDone {
 378		return 0
 379	}
 380	return t.docID + 1
 381}
 382
 383func (t *andMatchTree) nextDoc() uint32 {
 384	var max uint32
 385	for _, c := range t.children {
 386		m := c.nextDoc()
 387		if m > max {
 388			max = m
 389		}
 390	}
 391	return max
 392}
 393
 394func (t *orMatchTree) nextDoc() uint32 {
 395	min := uint32(maxUInt32)
 396	for _, c := range t.children {
 397		m := c.nextDoc()
 398		if m < min {
 399			min = m
 400		}
 401	}
 402	return min
 403}
 404
 405func (t *notMatchTree) nextDoc() uint32 {
 406	return 0
 407}
 408
 409func (t *fileNameMatchTree) nextDoc() uint32 {
 410	return t.child.nextDoc()
 411}
 412
 413func (t *branchQueryMatchTree) nextDoc() uint32 {
 414	var start uint32
 415	if t.firstDone {
 416		start = t.docID + 1
 417	}
 418
 419	for i := start; i < uint32(len(t.fileMasks)); i++ {
 420		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 421			return i
 422		}
 423	}
 424	return maxUInt32
 425}
 426
 427// all String methods
 428
 429func (t *bruteForceMatchTree) String() string {
 430	return "all"
 431}
 432
 433func (t *docMatchTree) String() string {
 434	return fmt.Sprintf("doc(%s)", t.reason)
 435}
 436
 437func (t *andMatchTree) String() string {
 438	return fmt.Sprintf("and%v", t.children)
 439}
 440
 441func (t *regexpMatchTree) String() string {
 442	f := ""
 443	if t.fileName {
 444		f = "f"
 445	}
 446	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 447}
 448
 449func (t *wordMatchTree) String() string {
 450	f := ""
 451	if t.fileName {
 452		f = "f"
 453	}
 454	return fmt.Sprintf("%sword(%s)", f, t.word)
 455}
 456
 457func (t *orMatchTree) String() string {
 458	return fmt.Sprintf("or%v", t.children)
 459}
 460
 461func (t *notMatchTree) String() string {
 462	return fmt.Sprintf("not(%v)", t.child)
 463}
 464
 465func (t *noVisitMatchTree) String() string {
 466	return fmt.Sprintf("novisit(%v)", t.matchTree)
 467}
 468
 469func (t *fileNameMatchTree) String() string {
 470	return fmt.Sprintf("f(%v)", t.child)
 471}
 472
 473func (t *substrMatchTree) String() string {
 474	f := ""
 475	if t.fileName {
 476		f = "f"
 477	}
 478
 479	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 480}
 481
 482func (t *branchQueryMatchTree) String() string {
 483	return fmt.Sprintf("branch(%x)", t.masks)
 484}
 485
 486func (t *symbolSubstrMatchTree) String() string {
 487	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 488}
 489
 490func (t *symbolRegexpMatchTree) String() string {
 491	return fmt.Sprintf("symbol(%v)", t.matchTree)
 492}
 493
 494// visitMatches visits all atoms in matchTree. Note: This visits
 495// noVisitMatchTree. For collecting matches use visitMatches.
 496func visitMatchTree(t matchTree, f func(matchTree)) {
 497	switch s := t.(type) {
 498	case *andMatchTree:
 499		for _, ch := range s.children {
 500			visitMatchTree(ch, f)
 501		}
 502	case *orMatchTree:
 503		for _, ch := range s.children {
 504			visitMatchTree(ch, f)
 505		}
 506	case *andLineMatchTree:
 507		visitMatchTree(&s.andMatchTree, f)
 508	case *noVisitMatchTree:
 509		visitMatchTree(s.matchTree, f)
 510	case *notMatchTree:
 511		visitMatchTree(s.child, f)
 512	case *fileNameMatchTree:
 513		visitMatchTree(s.child, f)
 514	case *symbolSubstrMatchTree:
 515		visitMatchTree(s.substrMatchTree, f)
 516	case *symbolRegexpMatchTree:
 517		visitMatchTree(s.matchTree, f)
 518	default:
 519		f(t)
 520	}
 521}
 522
 523// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 524// function defined.
 525func updateMatchTreeStats(mt matchTree, stats *Stats) {
 526	visitMatchTree(mt, func(mt matchTree) {
 527		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 528			atom.updateStats(stats)
 529		}
 530	})
 531}
 532
 533// visitMatches visits all atoms which can contribute matches. Note: This
 534// skips noVisitMatchTree.
 535func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 536	switch s := t.(type) {
 537	case *andMatchTree:
 538		for _, ch := range s.children {
 539			if known[ch] {
 540				visitMatches(ch, known, f)
 541			}
 542		}
 543	case *andLineMatchTree:
 544		visitMatches(&s.andMatchTree, known, f)
 545	case *orMatchTree:
 546		for _, ch := range s.children {
 547			if known[ch] {
 548				visitMatches(ch, known, f)
 549			}
 550		}
 551	case *symbolSubstrMatchTree:
 552		visitMatches(s.substrMatchTree, known, f)
 553	case *notMatchTree:
 554	case *noVisitMatchTree:
 555		// don't collect into negative trees.
 556	case *fileNameMatchTree:
 557		// We will just gather the filename if we do not visit this tree.
 558	default:
 559		f(s)
 560	}
 561}
 562
 563// all matches() methods.
 564
 565func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 566	return t.predicate(cp.idx), true
 567}
 568
 569func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 570	return true, true
 571}
 572
 573// andLineMatchTree is a performance optimization of andMatchTree. For content
 574// searches we don't want to run the regex engine if there is no line that
 575// contains matches from all terms.
 576func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 577	matches, sure := t.andMatchTree.matches(cp, cost, known)
 578	if !(sure && matches) {
 579		return matches, sure
 580	}
 581
 582	// find child with fewest candidates
 583	min := maxUInt32
 584	fewestChildren := 0
 585	for ix, child := range t.children {
 586		v, ok := child.(*substrMatchTree)
 587		// make sure we are running a content search and that all candidates are a
 588		// substrMatchTree
 589		if !ok || v.fileName {
 590			return matches, sure
 591		}
 592		if len(v.current) < min {
 593			min = len(v.current)
 594			fewestChildren = ix
 595		}
 596	}
 597
 598	type lineRange struct {
 599		start int
 600		end   int
 601	}
 602	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 603	prev := -1
 604	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 605		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 606		if line == prev {
 607			continue
 608		}
 609		prev = line
 610		lines = append(lines, lineRange{byteStart, byteEnd})
 611	}
 612
 613	// children keeps track of the children's candidates we have already seen.
 614	children := make([][]*candidateMatch, 0, len(t.children)-1)
 615	for j, child := range t.children {
 616		if j == fewestChildren {
 617			continue
 618		}
 619		children = append(children, child.(*substrMatchTree).current)
 620	}
 621
 622nextLine:
 623	for i := 0; i < len(lines); i++ {
 624		hits := 1
 625	nextChild:
 626		for j := range children {
 627		nextCandidate:
 628			for len(children[j]) > 0 {
 629				candidate := children[j][0]
 630				bo := int(cp.findOffset(false, candidate.runeOffset))
 631				if bo < lines[i].start {
 632					children[j] = children[j][1:]
 633					continue nextCandidate
 634				}
 635				if bo <= lines[i].end {
 636					hits++
 637					continue nextChild
 638				}
 639				// move the `lines` iterator forward until bo <= line.end
 640				for i < len(lines) && bo > lines[i].end {
 641					i++
 642				}
 643				i--
 644				continue nextLine
 645			}
 646		}
 647		// return early once we found any line that contains matches from all children
 648		if hits == len(t.children) {
 649			return matches, true
 650		}
 651	}
 652	return false, true
 653}
 654
 655func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 656	sure := true
 657
 658	for _, ch := range t.children {
 659		v, ok := evalMatchTree(cp, cost, known, ch)
 660		if ok && !v {
 661			return false, true
 662		}
 663		if !ok {
 664			sure = false
 665		}
 666	}
 667
 668	return true, sure
 669}
 670
 671func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 672	matches := false
 673	sure := true
 674	for _, ch := range t.children {
 675		v, ok := evalMatchTree(cp, cost, known, ch)
 676		if ok {
 677			// we could short-circuit, but we want to use
 678			// the other possibilities as a ranking
 679			// signal.
 680			matches = matches || v
 681		} else {
 682			sure = false
 683		}
 684	}
 685	return matches, sure
 686}
 687
 688func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 689	return t.branchMask() != 0, true
 690}
 691
 692func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 693	if t.reEvaluated {
 694		return len(t.found) > 0, true
 695	}
 696
 697	if cost < costRegexp {
 698		return false, false
 699	}
 700
 701	cp.stats.RegexpsConsidered++
 702	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 703	found := t.found[:0]
 704	for _, idx := range idxs {
 705		cm := &candidateMatch{
 706			byteOffset:  uint32(idx[0]),
 707			byteMatchSz: uint32(idx[1] - idx[0]),
 708			fileName:    t.fileName,
 709		}
 710
 711		found = append(found, cm)
 712	}
 713	t.found = found
 714	t.reEvaluated = true
 715
 716	return len(t.found) > 0, true
 717}
 718
 719func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 720	if t.evaluated {
 721		return len(t.found) > 0, true
 722	}
 723
 724	if cost < costRegexp {
 725		return false, false
 726	}
 727
 728	data := cp.data(t.fileName)
 729	offset := 0
 730	found := t.found[:0]
 731	for {
 732		idx := bytes.Index(data[offset:], []byte(t.word))
 733		if idx < 0 {
 734			break
 735		}
 736
 737		relStartOffset := offset + idx
 738		relEndOffset := relStartOffset + len(t.word)
 739
 740		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 741		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 742		if startBoundary && endBoundary {
 743			found = append(found, &candidateMatch{
 744				byteOffset:  uint32(offset + idx),
 745				byteMatchSz: uint32(len(t.word)),
 746				fileName:    t.fileName,
 747			})
 748		}
 749		offset += idx + len(t.word)
 750	}
 751
 752	t.found = found
 753	t.evaluated = true
 754
 755	return len(t.found) > 0, true
 756}
 757
 758// breakMatchesOnNewlines returns matches resulting from breaking each element
 759// of cms on newlines within text.
 760func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 761	var lineCMs []*candidateMatch
 762	for _, cm := range cms {
 763		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 764	}
 765	return lineCMs
 766}
 767
 768// breakOnNewlines returns matches resulting from breaking cm on newlines
 769// within text.
 770func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 771	var cms []*candidateMatch
 772	addMe := &candidateMatch{}
 773	*addMe = *cm
 774	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 775		if text[i] == '\n' {
 776			addMe.byteMatchSz = i - addMe.byteOffset
 777			if addMe.byteMatchSz != 0 {
 778				cms = append(cms, addMe)
 779			}
 780
 781			addMe = &candidateMatch{}
 782			*addMe = *cm
 783			addMe.byteOffset = i + 1
 784		}
 785	}
 786	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 787	if addMe.byteMatchSz != 0 {
 788		cms = append(cms, addMe)
 789	}
 790	return cms
 791}
 792
 793func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 794	if v, ok := known[mt]; ok {
 795		return v, true
 796	}
 797
 798	v, ok := mt.matches(cp, cost, known)
 799	if ok {
 800		known[mt] = v
 801	}
 802
 803	return v, ok
 804}
 805
 806func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 807	v, ok := evalMatchTree(cp, cost, known, t.child)
 808	return !v, ok
 809}
 810
 811func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 812	return evalMatchTree(cp, cost, known, t.child)
 813}
 814
 815func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 816	if t.contEvaluated {
 817		return len(t.current) > 0, true
 818	}
 819
 820	if len(t.current) == 0 {
 821		return false, true
 822	}
 823
 824	if t.fileName && cost < costMemory {
 825		return false, false
 826	}
 827
 828	if !t.fileName && cost < costContent {
 829		return false, false
 830	}
 831
 832	pruned := t.current[:0]
 833	for _, m := range t.current {
 834		if m.byteOffset == 0 && m.runeOffset > 0 {
 835			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 836		}
 837		if m.matchContent(cp.data(m.fileName)) {
 838			pruned = append(pruned, m)
 839		}
 840	}
 841	t.current = pruned
 842	t.contEvaluated = true
 843
 844	return len(t.current) > 0, true
 845}
 846
 847type matchTreeOpt struct {
 848	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 849	// This was added since we do not support wordMatchTree with symbol search.
 850	DisableWordMatchOptimization bool
 851}
 852
 853func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 854	if q == nil {
 855		return nil, fmt.Errorf("got nil (sub)query")
 856	}
 857	switch s := q.(type) {
 858	case *query.Regexp:
 859		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 860		// superset of the regexp. If the returned matchTree is equivalent to the
 861		// original regexp, it returns true. An equivalent matchTree has the same
 862		// behaviour as the original regexp and can be used instead.
 863		//
 864		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 865		if err != nil {
 866			return nil, err
 867		}
 868		// if the query can be used in place of the regexp
 869		// return the subtree
 870		if isEq {
 871			return subMT, nil
 872		}
 873
 874		var tr matchTree
 875		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 876			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 877			// provide something faster.
 878			tr = wmt
 879		} else {
 880			prefix := ""
 881			if !s.CaseSensitive {
 882				prefix = "(?i)"
 883			}
 884
 885			tr = &regexpMatchTree{
 886				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 887				fileName: s.FileName,
 888			}
 889		}
 890
 891		return &andMatchTree{
 892			children: []matchTree{
 893				tr, &noVisitMatchTree{subMT},
 894			},
 895		}, nil
 896	case *query.And:
 897		var r []matchTree
 898		for _, ch := range s.Children {
 899			ct, err := d.newMatchTree(ch, opt)
 900			if err != nil {
 901				return nil, err
 902			}
 903			r = append(r, ct)
 904		}
 905		return &andMatchTree{r}, nil
 906	case *query.Or:
 907		var r []matchTree
 908		for _, ch := range s.Children {
 909			ct, err := d.newMatchTree(ch, opt)
 910			if err != nil {
 911				return nil, err
 912			}
 913			r = append(r, ct)
 914		}
 915		return &orMatchTree{r}, nil
 916	case *query.Not:
 917		ct, err := d.newMatchTree(s.Child, opt)
 918		return &notMatchTree{
 919			child: ct,
 920		}, err
 921
 922	case *query.Type:
 923		if s.Type != query.TypeFileName {
 924			break
 925		}
 926
 927		ct, err := d.newMatchTree(s.Child, opt)
 928		if err != nil {
 929			return nil, err
 930		}
 931
 932		return &fileNameMatchTree{
 933			child: ct,
 934		}, nil
 935
 936	case *query.Substring:
 937		return d.newSubstringMatchTree(s)
 938
 939	case *query.Branch:
 940		masks := make([]uint64, 0, len(d.repoMetaData))
 941		if s.Pattern == "HEAD" {
 942			for i := 0; i < len(d.repoMetaData); i++ {
 943				masks = append(masks, 1)
 944			}
 945		} else {
 946			for _, branchIDs := range d.branchIDs {
 947				mask := uint64(0)
 948				for nm, m := range branchIDs {
 949					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
 950						mask |= uint64(m)
 951					}
 952				}
 953				masks = append(masks, mask)
 954			}
 955
 956		}
 957		return &branchQueryMatchTree{
 958			masks:     masks,
 959			fileMasks: d.fileBranchMasks,
 960			repos:     d.repos,
 961		}, nil
 962	case *query.Const:
 963		if s.Value {
 964			return &bruteForceMatchTree{}, nil
 965		} else {
 966			return &noMatchTree{Why: "const"}, nil
 967		}
 968	case *query.Language:
 969		code, ok := d.metaData.LanguageMap[s.Language]
 970		if !ok {
 971			return &noMatchTree{Why: "lang"}, nil
 972		}
 973		return &docMatchTree{
 974			reason:  "language",
 975			numDocs: d.numDocs(),
 976			predicate: func(docID uint32) bool {
 977				return d.getLanguage(docID) == code
 978			},
 979		}, nil
 980
 981	case *query.Symbol:
 982		// Disable WordMatchTree since we don't support it in symbols yet.
 983		optCopy := opt
 984		optCopy.DisableWordMatchOptimization = true
 985
 986		subMT, err := d.newMatchTree(s.Expr, optCopy)
 987		if err != nil {
 988			return nil, err
 989		}
 990
 991		if substr, ok := subMT.(*substrMatchTree); ok {
 992			return &symbolSubstrMatchTree{
 993				substrMatchTree: substr,
 994				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
 995				fileEndRunes:    d.fileEndRunes,
 996				fileEndSymbol:   d.fileEndSymbol,
 997				sections:        d.runeDocSections,
 998			}, nil
 999		}
1000
1001		var regexp *regexp.Regexp
1002		visitMatchTree(subMT, func(mt matchTree) {
1003			if t, ok := mt.(*regexpMatchTree); ok {
1004				regexp = t.regexp
1005			}
1006		})
1007		if regexp == nil {
1008			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1009		}
1010
1011		return &symbolRegexpMatchTree{
1012			regexp:    regexp,
1013			all:       regexp.String() == "(?i)(?-s:.)*",
1014			matchTree: subMT,
1015		}, nil
1016
1017	case *query.FileNameSet:
1018		return &docMatchTree{
1019			reason:  "FileNameSet",
1020			numDocs: d.numDocs(),
1021			predicate: func(docID uint32) bool {
1022				fileName := d.fileName(docID)
1023				_, ok := s.Set[string(fileName)]
1024				return ok
1025			},
1026		}, nil
1027
1028	case *query.BranchesRepos:
1029		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1030		for repoIdx := range d.repoMetaData {
1031			var mask uint64
1032			for _, br := range s.List {
1033				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1034					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1035				}
1036			}
1037			reposBranchesWant[repoIdx] = mask
1038		}
1039		return &docMatchTree{
1040			reason:  "BranchesRepos",
1041			numDocs: d.numDocs(),
1042			predicate: func(docID uint32) bool {
1043				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1044			},
1045		}, nil
1046
1047	case *query.RepoSet:
1048		reposWant := make([]bool, len(d.repoMetaData))
1049		for repoIdx, r := range d.repoMetaData {
1050			if _, ok := s.Set[r.Name]; ok {
1051				reposWant[repoIdx] = true
1052			}
1053		}
1054		return &docMatchTree{
1055			reason:  "RepoSet",
1056			numDocs: d.numDocs(),
1057			predicate: func(docID uint32) bool {
1058				return reposWant[d.repos[docID]]
1059			},
1060		}, nil
1061
1062	case *query.RepoIDs:
1063		reposWant := make([]bool, len(d.repoMetaData))
1064		for repoIdx, r := range d.repoMetaData {
1065			if s.Repos.Contains(r.ID) {
1066				reposWant[repoIdx] = true
1067			}
1068		}
1069		return &docMatchTree{
1070			reason:  "RepoIDs",
1071			numDocs: d.numDocs(),
1072			predicate: func(docID uint32) bool {
1073				return reposWant[d.repos[docID]]
1074			},
1075		}, nil
1076
1077	case *query.Repo:
1078		reposWant := make([]bool, len(d.repoMetaData))
1079		for repoIdx, r := range d.repoMetaData {
1080			if s.Regexp.MatchString(r.Name) {
1081				reposWant[repoIdx] = true
1082			}
1083		}
1084		return &docMatchTree{
1085			reason:  "Repo",
1086			numDocs: d.numDocs(),
1087			predicate: func(docID uint32) bool {
1088				return reposWant[d.repos[docID]]
1089			},
1090		}, nil
1091
1092	case *query.RepoRegexp:
1093		reposWant := make([]bool, len(d.repoMetaData))
1094		for repoIdx, r := range d.repoMetaData {
1095			if s.Regexp.MatchString(r.Name) {
1096				reposWant[repoIdx] = true
1097			}
1098		}
1099		return &docMatchTree{
1100			reason:  "RepoRegexp",
1101			numDocs: d.numDocs(),
1102			predicate: func(docID uint32) bool {
1103				return reposWant[d.repos[docID]]
1104			},
1105		}, nil
1106
1107	case query.RawConfig:
1108		return &docMatchTree{
1109			reason:  s.String(),
1110			numDocs: d.numDocs(),
1111			predicate: func(docID uint32) bool {
1112				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1113			},
1114		}, nil
1115	}
1116	log.Panicf("type %T", q)
1117	return nil, nil
1118}
1119
1120func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1121	st := &substrMatchTree{
1122		query:         s,
1123		caseSensitive: s.CaseSensitive,
1124		fileName:      s.FileName,
1125	}
1126
1127	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1128		prefix := ""
1129		if !s.CaseSensitive {
1130			prefix = "(?i)"
1131		}
1132		t := &regexpMatchTree{
1133			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1134			fileName: s.FileName,
1135		}
1136		return t, nil
1137	}
1138
1139	result, err := d.iterateNgrams(s)
1140	if err != nil {
1141		return nil, err
1142	}
1143	st.matchIterator = result
1144	return st, nil
1145}
1146
1147func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1148	if opt.DisableWordMatchOptimization {
1149		return nil, false
1150	}
1151	// Needs to be case sensitive
1152	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1153		return nil, false
1154	}
1155	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1156	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1157		return nil, false
1158	}
1159	sub := q.Regexp.Sub
1160	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1161		return nil, false
1162	}
1163
1164	return &wordMatchTree{
1165		word:     string(sub[1].Rune),
1166		fileName: q.FileName,
1167	}, true
1168}
1169
1170// pruneMatchTree removes impossible branches from the matchTree, as indicated
1171// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1172func pruneMatchTree(mt matchTree) (matchTree, error) {
1173	var err error
1174	switch mt := mt.(type) {
1175	// leaf nodes that we test with the filter:
1176	case *substrMatchTree:
1177		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1178			if _, ok := res.matchIterator.(*noMatchTree); ok {
1179				return nil, nil
1180			}
1181		}
1182	// recursive tree structures:
1183	case *andMatchTree:
1184		// Any branch of an and becoming impossible means the entire clause
1185		// is impossible. Otherwise, just handle rewrites.
1186		for i, child := range mt.children {
1187			newChild, err := pruneMatchTree(child)
1188			if err != nil {
1189				return nil, err
1190			}
1191			if newChild == nil {
1192				return nil, nil
1193			}
1194			mt.children[i] = newChild
1195		}
1196	case *orMatchTree:
1197		// *All* branches of an OR becoming impossible means the entire clause
1198		// is impossible. Otherwise, drop impossible subclauses and handle
1199		// rewrites, including simplifying to a singular resulting child branch.
1200		n := 0
1201		for _, child := range mt.children {
1202			newChild, err := pruneMatchTree(child)
1203			if err != nil {
1204				return nil, err
1205			}
1206			if newChild != nil {
1207				mt.children[n] = newChild
1208				n++
1209			}
1210		}
1211		mt.children = mt.children[:n]
1212		if len(mt.children) == 1 {
1213			return mt.children[0], nil
1214		} else if len(mt.children) == 0 {
1215			return nil, nil
1216		}
1217	case *noVisitMatchTree:
1218		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1219		if err != nil {
1220			return nil, err
1221		}
1222		if mt.matchTree == nil {
1223			return nil, nil
1224		}
1225	case *fileNameMatchTree:
1226		mt.child, err = pruneMatchTree(mt.child)
1227	case *andLineMatchTree:
1228		child, err := pruneMatchTree(&mt.andMatchTree)
1229		if err != nil {
1230			return nil, err
1231		}
1232		if child == nil {
1233			return nil, nil
1234		}
1235		if c, ok := child.(*andMatchTree); ok {
1236			mt.andMatchTree = *c
1237		} else {
1238			// the and was simplified to a single clause,
1239			// so the linematch portion is irrelevant.
1240			return mt, nil
1241		}
1242	case *notMatchTree:
1243		mt.child, err = pruneMatchTree(mt.child)
1244		if err != nil {
1245			return nil, err
1246		}
1247		if mt.child == nil {
1248			// not false => true
1249			return &bruteForceMatchTree{}, nil
1250		}
1251	// unhandled:
1252	case *docMatchTree:
1253	case *bruteForceMatchTree:
1254	case *regexpMatchTree:
1255	case *wordMatchTree:
1256	}
1257	return mt, err
1258}
Configure Feed

Configure Feed