index/matchtree.go at 75879a727dd21041fd8485106bf93e75be5ef028 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchtree.go
at 75879a727dd21041fd8485106bf93e75be5ef028 34 kB View raw
Julie Tibshirani Fix compilation on 32 bit architectures (#936) 1y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package index
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"math"
  22	"regexp/syntax"
  23	"strings"
  24	"unicode/utf8"
  25
  26	"github.com/grafana/regexp"
  27	"github.com/sourcegraph/zoekt"
  28	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  29	"github.com/sourcegraph/zoekt/query"
  30)
  31
  32// A docIterator iterates over documents in order.
  33type docIterator interface {
  34	// provide the next document where we may find something interesting.
  35	//
  36	// This is like a "peek" and shouldn't mutate state. prepare is what should
  37	// change state.
  38	nextDoc() uint32
  39
  40	// clears any per-document state of the docIterator, and
  41	// prepares for evaluating the given doc. The argument is
  42	// strictly increasing over time.
  43	prepare(nextDoc uint32)
  44}
  45
  46// costs are passed in increasing order to matchTree.matches until they do not
  47// return matchesRequiresHigherCost.
  48const (
  49	costConst   = 0
  50	costMemory  = 1
  51	costContent = 2
  52	costRegexp  = 3
  53)
  54
  55const (
  56	costMin = costConst
  57	costMax = costRegexp
  58)
  59
  60// matchesState is an enum for the state of a matchTree after a call to
  61// matchTree.matches.
  62type matchesState uint8
  63
  64const (
  65	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  66	// a search yet since the cost value is not high enough.
  67	matchesRequiresHigherCost matchesState = iota
  68
  69	// matchesFound is returned when matchTree.matches has done a search and
  70	// found one or more matches.
  71	matchesFound
  72
  73	// matchesNone is returned when matchTree.matches has done a search and
  74	// found nothing.
  75	matchesNone
  76)
  77
  78// matchesStatePred is a helper which returns matchesFound if b is true
  79// otherwise returns matchesNone.
  80func matchesStatePred(b bool) matchesState {
  81	if b {
  82		return matchesFound
  83	}
  84	return matchesNone
  85}
  86
  87// matchesStateForSlice is a helper which returns matchesFound if v is
  88// non-empty otherwise returns matchesNone.
  89func matchesStateForSlice[T any](v []T) matchesState {
  90	return matchesStatePred(len(v) > 0)
  91}
  92
  93// An expression tree coupled with matches. The matchtree has two
  94// functions:
  95//
  96// * it implements boolean combinations (and, or, not)
  97//
  98// * it implements shortcuts, where we skip documents (for example: if
  99// there are no trigram matches, we can be sure there are no substring
 100// matches). The matchtree iterates over the documents as they are
 101// ordered in the shard.
 102//
 103// The general process for a given (shard, query) is:
 104//
 105// - construct matchTree for the query
 106//
 107// - find all different leaf matchTrees (substring, regexp, etc.)
 108//
 109// in a loop:
 110//
 111//   - find next doc to process using nextDoc
 112//
 113//   - evaluate atoms (leaf expressions that match text)
 114//
 115//   - evaluate the tree using matches(), storing the result in map.
 116//
 117//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 118//     for the document, collect all text matches by looking at leaf
 119//     matchTrees.
 120type matchTree interface {
 121	docIterator
 122
 123	// matches if cost is high enough. See documentation for matchesState's
 124	// values.
 125	//
 126	// Note: Do not call this directly, rather use evalMatchTree which uses
 127	// known to cache responses once the state transitions away from
 128	// matchesRequiresHigherCost.
 129	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 130}
 131
 132// docMatchTree iterates over documents for which predicate(docID) returns true.
 133type docMatchTree struct {
 134	// the number of documents in a shard.
 135	numDocs uint32
 136
 137	predicate func(docID uint32) bool
 138
 139	// provides additional information about the reason why the docMatchTree was
 140	// created.
 141	reason string
 142
 143	// mutable
 144	firstDone bool
 145	docID     uint32
 146}
 147
 148type bruteForceMatchTree struct {
 149	// mutable
 150	firstDone bool
 151	docID     uint32
 152}
 153
 154type andLineMatchTree struct {
 155	andMatchTree
 156}
 157
 158type andMatchTree struct {
 159	children []matchTree
 160}
 161
 162type orMatchTree struct {
 163	children []matchTree
 164}
 165
 166type notMatchTree struct {
 167	child matchTree
 168}
 169
 170// Returns only the filename of child matches.
 171type fileNameMatchTree struct {
 172	child matchTree
 173}
 174
 175type boostMatchTree struct {
 176	child matchTree
 177	boost float64
 178}
 179
 180// Don't visit this subtree for collecting matches.
 181type noVisitMatchTree struct {
 182	matchTree
 183}
 184
 185type regexpMatchTree struct {
 186	regexp *regexp.Regexp
 187
 188	// origRegexp is the original parsed regexp from the query structure. It
 189	// does not include mutations such as case sensitivity.
 190	origRegexp *syntax.Regexp
 191
 192	fileName bool
 193
 194	// mutable
 195	reEvaluated bool
 196	found       []*candidateMatch
 197
 198	// nextDoc, prepare.
 199	bruteForceMatchTree
 200}
 201
 202func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 203	prefix := ""
 204	if !s.CaseSensitive {
 205		prefix = "(?i)"
 206	}
 207
 208	return &regexpMatchTree{
 209		regexp:     regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
 210		origRegexp: s.Regexp,
 211		fileName:   s.FileName,
 212	}
 213}
 214
 215// \bLITERAL\b
 216type wordMatchTree struct {
 217	word string
 218
 219	fileName bool
 220
 221	// mutable
 222	evaluated bool
 223	found     []*candidateMatch
 224
 225	// nextDoc, prepare.
 226	bruteForceMatchTree
 227}
 228
 229type substrMatchTree struct {
 230	matchIterator
 231
 232	query         *query.Substring
 233	caseSensitive bool
 234	fileName      bool
 235
 236	// mutable
 237	current       []*candidateMatch
 238	contEvaluated bool
 239}
 240
 241type branchQueryMatchTree struct {
 242	fileMasks []uint64
 243	masks     []uint64
 244	repos     []uint16
 245
 246	// mutable
 247	firstDone bool
 248	docID     uint32
 249}
 250
 251func (t *branchQueryMatchTree) branchMask() uint64 {
 252	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 253}
 254
 255type symbolRegexpMatchTree struct {
 256	matchTree
 257	regexp *regexp.Regexp
 258	all    bool // skips regex match if .*
 259
 260	reEvaluated bool
 261	found       []*candidateMatch
 262}
 263
 264func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 265	t.reEvaluated = false
 266	t.found = t.found[:0]
 267	t.matchTree.prepare(doc)
 268}
 269
 270func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 271	if t.reEvaluated {
 272		return matchesStateForSlice(t.found)
 273	}
 274
 275	if cost < costRegexp {
 276		return matchesRequiresHigherCost
 277	}
 278
 279	sections := cp.docSections()
 280	content := cp.data(false)
 281
 282	found := t.found[:0]
 283	for i, sec := range sections {
 284		var idx []int
 285		if t.all {
 286			idx = []int{0, int(sec.End - sec.Start)}
 287		} else {
 288			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 289			if idx == nil {
 290				continue
 291			}
 292		}
 293
 294		cm := &candidateMatch{
 295			byteOffset:  sec.Start + uint32(idx[0]),
 296			byteMatchSz: uint32(idx[1] - idx[0]),
 297			symbol:      true,
 298			symbolIdx:   uint32(i),
 299		}
 300		found = append(found, cm)
 301	}
 302	t.found = found
 303	t.reEvaluated = true
 304
 305	return matchesStateForSlice(t.found)
 306}
 307
 308type symbolSubstrMatchTree struct {
 309	*substrMatchTree
 310
 311	patternSize   uint32
 312	fileEndRunes  []uint32
 313	fileEndSymbol []uint32
 314
 315	doc      uint32
 316	sections []DocumentSection
 317
 318	secID uint32
 319}
 320
 321func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 322	t.substrMatchTree.prepare(doc)
 323	t.doc = doc
 324
 325	var fileStart uint32
 326	if doc > 0 {
 327		fileStart = t.fileEndRunes[doc-1]
 328	}
 329
 330	var sections []DocumentSection
 331	if len(t.sections) > 0 {
 332		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 333		if most == uint32(len(t.sections)) {
 334			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 335		} else {
 336			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 337				t.secID++
 338			}
 339
 340			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 341			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 342				symbolEnd++
 343			}
 344
 345			sections = t.sections[t.secID:symbolEnd]
 346		}
 347	}
 348
 349	secIdx := 0
 350	trimmed := t.current[:0]
 351	for len(sections) > secIdx && len(t.current) > 0 {
 352		start := fileStart + t.current[0].runeOffset
 353		end := start + t.patternSize
 354		if start >= sections[secIdx].End {
 355			secIdx++
 356			continue
 357		}
 358
 359		if start < sections[secIdx].Start {
 360			t.current = t.current[1:]
 361			continue
 362		}
 363
 364		if end <= sections[secIdx].End {
 365			t.current[0].symbol = true
 366			t.current[0].symbolIdx = uint32(secIdx)
 367			trimmed = append(trimmed, t.current[0])
 368		}
 369
 370		t.current = t.current[1:]
 371	}
 372	t.current = trimmed
 373}
 374
 375// all prepare methods
 376
 377func (t *bruteForceMatchTree) prepare(doc uint32) {
 378	t.docID = doc
 379	t.firstDone = true
 380}
 381
 382func (t *docMatchTree) prepare(doc uint32) {
 383	t.docID = doc
 384	t.firstDone = true
 385}
 386
 387func (t *andMatchTree) prepare(doc uint32) {
 388	for _, c := range t.children {
 389		c.prepare(doc)
 390	}
 391}
 392
 393func (t *regexpMatchTree) prepare(doc uint32) {
 394	t.found = t.found[:0]
 395	t.reEvaluated = false
 396	t.bruteForceMatchTree.prepare(doc)
 397}
 398
 399func (t *wordMatchTree) prepare(doc uint32) {
 400	t.found = t.found[:0]
 401	t.evaluated = false
 402	t.bruteForceMatchTree.prepare(doc)
 403}
 404
 405func (t *orMatchTree) prepare(doc uint32) {
 406	for _, c := range t.children {
 407		c.prepare(doc)
 408	}
 409}
 410
 411func (t *notMatchTree) prepare(doc uint32) {
 412	t.child.prepare(doc)
 413}
 414
 415func (t *fileNameMatchTree) prepare(doc uint32) {
 416	t.child.prepare(doc)
 417}
 418
 419func (t *boostMatchTree) prepare(doc uint32) {
 420	t.child.prepare(doc)
 421}
 422
 423func (t *substrMatchTree) prepare(nextDoc uint32) {
 424	t.matchIterator.prepare(nextDoc)
 425	t.current = t.matchIterator.candidates()
 426	t.contEvaluated = false
 427}
 428
 429func (t *branchQueryMatchTree) prepare(doc uint32) {
 430	t.firstDone = true
 431	t.docID = doc
 432}
 433
 434// nextDoc
 435
 436func (t *docMatchTree) nextDoc() uint32 {
 437	var start uint32
 438	if t.firstDone {
 439		start = t.docID + 1
 440	}
 441	for i := start; i < t.numDocs; i++ {
 442		if t.predicate(i) {
 443			return i
 444		}
 445	}
 446	return math.MaxUint32
 447}
 448
 449func (t *bruteForceMatchTree) nextDoc() uint32 {
 450	if !t.firstDone {
 451		return 0
 452	}
 453	return t.docID + 1
 454}
 455
 456func (t *andMatchTree) nextDoc() uint32 {
 457	var max uint32
 458	for _, c := range t.children {
 459		m := c.nextDoc()
 460		if m > max {
 461			max = m
 462		}
 463	}
 464	return max
 465}
 466
 467func (t *orMatchTree) nextDoc() uint32 {
 468	min := uint32(math.MaxUint32)
 469	for _, c := range t.children {
 470		m := c.nextDoc()
 471		if m < min {
 472			min = m
 473		}
 474	}
 475	return min
 476}
 477
 478func (t *notMatchTree) nextDoc() uint32 {
 479	return 0
 480}
 481
 482func (t *fileNameMatchTree) nextDoc() uint32 {
 483	return t.child.nextDoc()
 484}
 485
 486func (t *boostMatchTree) nextDoc() uint32 {
 487	return t.child.nextDoc()
 488}
 489
 490func (t *branchQueryMatchTree) nextDoc() uint32 {
 491	var start uint32
 492	if t.firstDone {
 493		start = t.docID + 1
 494	}
 495
 496	for i := start; i < uint32(len(t.fileMasks)); i++ {
 497		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 498			return i
 499		}
 500	}
 501	return math.MaxUint32
 502}
 503
 504// all String methods
 505
 506func (t *bruteForceMatchTree) String() string {
 507	return "all"
 508}
 509
 510func (t *docMatchTree) String() string {
 511	return fmt.Sprintf("doc(%s)", t.reason)
 512}
 513
 514func (t *andMatchTree) String() string {
 515	return fmt.Sprintf("and%v", t.children)
 516}
 517
 518func (t *regexpMatchTree) String() string {
 519	f := ""
 520	if t.fileName {
 521		f = "f"
 522	}
 523	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 524}
 525
 526func (t *wordMatchTree) String() string {
 527	f := ""
 528	if t.fileName {
 529		f = "f"
 530	}
 531	return fmt.Sprintf("%sword(%s)", f, t.word)
 532}
 533
 534func (t *orMatchTree) String() string {
 535	return fmt.Sprintf("or%v", t.children)
 536}
 537
 538func (t *notMatchTree) String() string {
 539	return fmt.Sprintf("not(%v)", t.child)
 540}
 541
 542func (t *noVisitMatchTree) String() string {
 543	return fmt.Sprintf("novisit(%v)", t.matchTree)
 544}
 545
 546func (t *fileNameMatchTree) String() string {
 547	return fmt.Sprintf("f(%v)", t.child)
 548}
 549
 550func (t *boostMatchTree) String() string {
 551	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 552}
 553
 554func (t *substrMatchTree) String() string {
 555	f := ""
 556	if t.fileName {
 557		f = "f"
 558	}
 559
 560	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 561}
 562
 563func (t *branchQueryMatchTree) String() string {
 564	return fmt.Sprintf("branch(%x)", t.masks)
 565}
 566
 567func (t *symbolSubstrMatchTree) String() string {
 568	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 569}
 570
 571func (t *symbolRegexpMatchTree) String() string {
 572	return fmt.Sprintf("symbol(%v)", t.matchTree)
 573}
 574
 575// visitMatches visits all atoms in matchTree. Note: This visits
 576// noVisitMatchTree. For collecting matches use visitMatches.
 577func visitMatchTree(t matchTree, f func(matchTree)) {
 578	switch s := t.(type) {
 579	case *andMatchTree:
 580		for _, ch := range s.children {
 581			visitMatchTree(ch, f)
 582		}
 583	case *orMatchTree:
 584		for _, ch := range s.children {
 585			visitMatchTree(ch, f)
 586		}
 587	case *andLineMatchTree:
 588		visitMatchTree(&s.andMatchTree, f)
 589	case *noVisitMatchTree:
 590		visitMatchTree(s.matchTree, f)
 591	case *notMatchTree:
 592		visitMatchTree(s.child, f)
 593	case *fileNameMatchTree:
 594		visitMatchTree(s.child, f)
 595	case *boostMatchTree:
 596		visitMatchTree(s.child, f)
 597	case *symbolSubstrMatchTree:
 598		visitMatchTree(s.substrMatchTree, f)
 599	case *symbolRegexpMatchTree:
 600		visitMatchTree(s.matchTree, f)
 601	default:
 602		f(t)
 603	}
 604}
 605
 606// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 607// function defined.
 608func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
 609	visitMatchTree(mt, func(mt matchTree) {
 610		if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
 611			atom.updateStats(stats)
 612		}
 613	})
 614}
 615
 616func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 617	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 618		f(mt)
 619	})
 620}
 621
 622// visitMatches visits all atoms which can contribute matches. Note: This
 623// skips noVisitMatchTree.
 624func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 625	switch s := t.(type) {
 626	case *andMatchTree:
 627		for _, ch := range s.children {
 628			if known[ch] {
 629				visitMatches(ch, known, weight, f)
 630			}
 631		}
 632	case *andLineMatchTree:
 633		visitMatches(&s.andMatchTree, known, weight, f)
 634	case *orMatchTree:
 635		for _, ch := range s.children {
 636			if known[ch] {
 637				visitMatches(ch, known, weight, f)
 638			}
 639		}
 640	case *boostMatchTree:
 641		visitMatches(s.child, known, weight*s.boost, f)
 642	case *symbolSubstrMatchTree:
 643		visitMatches(s.substrMatchTree, known, weight, f)
 644	case *notMatchTree:
 645	case *noVisitMatchTree:
 646		// don't collect into negative trees.
 647	case *fileNameMatchTree:
 648		// We will just gather the filename if we do not visit this tree.
 649	default:
 650		f(s, weight)
 651	}
 652}
 653
 654// all matches() methods.
 655
 656func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 657	return matchesStatePred(t.predicate(cp.idx))
 658}
 659
 660func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 661	return matchesFound
 662}
 663
 664// andLineMatchTree is a performance optimization of andMatchTree. For content
 665// searches we don't want to run the regex engine if there is no line that
 666// contains matches from all terms.
 667func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 668	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 669		return state
 670	}
 671
 672	// Invariant: all children have matches. If any line contains all of them we
 673	// can return MatchesFound.
 674
 675	// find child with fewest candidates
 676	minCount := math.MaxInt
 677	fewestChildren := 0
 678	for ix, child := range t.children {
 679		v, ok := child.(*substrMatchTree)
 680		// make sure we are running a content search and that all candidates are a
 681		// substrMatchTree
 682		if !ok || v.fileName {
 683			return matchesFound
 684		}
 685		if len(v.current) < minCount {
 686			minCount = len(v.current)
 687			fewestChildren = ix
 688		}
 689	}
 690
 691	type lineRange struct {
 692		start int
 693		end   int
 694	}
 695	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 696	prev := -1
 697	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 698		line := cp.newlines().atOffset(candidate.byteOffset)
 699		if line == prev {
 700			continue
 701		}
 702		prev = line
 703		byteStart := int(cp.newlines().lineStart(line))
 704		byteEnd := int(cp.newlines().lineStart(line + 1))
 705		lines = append(lines, lineRange{byteStart, byteEnd})
 706	}
 707
 708	// children keeps track of the children's candidates we have already seen.
 709	children := make([][]*candidateMatch, 0, len(t.children)-1)
 710	for j, child := range t.children {
 711		if j == fewestChildren {
 712			continue
 713		}
 714		children = append(children, child.(*substrMatchTree).current)
 715	}
 716
 717nextLine:
 718	for i := 0; i < len(lines); i++ {
 719		hits := 1
 720	nextChild:
 721		for j := range children {
 722		nextCandidate:
 723			for len(children[j]) > 0 {
 724				candidate := children[j][0]
 725				bo := int(cp.findOffset(false, candidate.runeOffset))
 726				if bo < lines[i].start {
 727					children[j] = children[j][1:]
 728					continue nextCandidate
 729				}
 730				if bo < lines[i].end {
 731					hits++
 732					continue nextChild
 733				}
 734				// move the `lines` iterator forward until bo < line.end
 735				for i < len(lines) && bo >= lines[i].end {
 736					i++
 737				}
 738				i--
 739				continue nextLine
 740			}
 741		}
 742		// return early once we found any line that contains matches from all children
 743		if hits == len(t.children) {
 744			return matchesFound
 745		}
 746	}
 747	return matchesNone
 748}
 749
 750func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 751	// We have found matches unless a child needs to do more work or it hasn't
 752	// found matches.
 753	state := matchesFound
 754
 755	for _, ch := range t.children {
 756		switch evalMatchTree(cp, cost, known, ch) {
 757		case matchesRequiresHigherCost:
 758			// keep evaluating other children incase we come across matchesNone
 759			state = matchesRequiresHigherCost
 760		case matchesFound:
 761			// will return this if every child has this value
 762		case matchesNone:
 763			return matchesNone
 764		}
 765	}
 766
 767	return state
 768}
 769
 770func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 771	// we could short-circuit, but we want to use the other possibilities as a
 772	// ranking signal. So we always return the most conservative state.
 773	state := matchesNone
 774	for _, ch := range t.children {
 775		switch evalMatchTree(cp, cost, known, ch) {
 776		case matchesRequiresHigherCost:
 777			state = matchesRequiresHigherCost
 778		case matchesFound:
 779			if state != matchesRequiresHigherCost {
 780				state = matchesFound
 781			}
 782		case matchesNone:
 783			// noop
 784		}
 785	}
 786	return state
 787}
 788
 789func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 790	return matchesStatePred(t.branchMask() != 0)
 791}
 792
 793func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 794	if t.reEvaluated {
 795		return matchesStateForSlice(t.found)
 796	}
 797
 798	if cost < costRegexp {
 799		return matchesRequiresHigherCost
 800	}
 801
 802	cp.stats.RegexpsConsidered++
 803	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 804	found := t.found[:0]
 805	for _, idx := range idxs {
 806		cm := &candidateMatch{
 807			byteOffset:  uint32(idx[0]),
 808			byteMatchSz: uint32(idx[1] - idx[0]),
 809			fileName:    t.fileName,
 810		}
 811
 812		found = append(found, cm)
 813	}
 814	t.found = found
 815	t.reEvaluated = true
 816
 817	return matchesStateForSlice(t.found)
 818}
 819
 820func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 821	if t.evaluated {
 822		return matchesStateForSlice(t.found)
 823	}
 824
 825	if cost < costRegexp {
 826		return matchesRequiresHigherCost
 827	}
 828
 829	data := cp.data(t.fileName)
 830	offset := 0
 831	found := t.found[:0]
 832	for {
 833		idx := bytes.Index(data[offset:], []byte(t.word))
 834		if idx < 0 {
 835			break
 836		}
 837
 838		relStartOffset := offset + idx
 839		relEndOffset := relStartOffset + len(t.word)
 840
 841		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 842		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 843		if startBoundary && endBoundary {
 844			found = append(found, &candidateMatch{
 845				byteOffset:  uint32(offset + idx),
 846				byteMatchSz: uint32(len(t.word)),
 847				fileName:    t.fileName,
 848			})
 849		}
 850		offset += idx + len(t.word)
 851	}
 852
 853	t.found = found
 854	t.evaluated = true
 855
 856	return matchesStateForSlice(t.found)
 857}
 858
 859// breakMatchesOnNewlines returns matches resulting from breaking each element
 860// of cms on newlines within text.
 861func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 862	var lineCMs []*candidateMatch
 863	for _, cm := range cms {
 864		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 865	}
 866	return lineCMs
 867}
 868
 869// breakOnNewlines returns matches resulting from breaking cm on newlines
 870// within text.
 871func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 872	var cms []*candidateMatch
 873	addMe := &candidateMatch{}
 874	*addMe = *cm
 875	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 876		if text[i] == '\n' {
 877			addMe.byteMatchSz = i - addMe.byteOffset
 878			if addMe.byteMatchSz != 0 {
 879				cms = append(cms, addMe)
 880			}
 881
 882			addMe = &candidateMatch{}
 883			*addMe = *cm
 884			addMe.byteOffset = i + 1
 885		}
 886	}
 887	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 888	if addMe.byteMatchSz != 0 {
 889		cms = append(cms, addMe)
 890	}
 891	return cms
 892}
 893
 894// evalMatchTree should be called instead of directly calling
 895// matchTree.matches. It cache known values for future evaluation at higher
 896// costs.
 897func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 898	if v, ok := known[mt]; ok {
 899		return matchesStatePred(v)
 900	}
 901
 902	ms := mt.matches(cp, cost, known)
 903	if ms != matchesRequiresHigherCost {
 904		known[mt] = ms == matchesFound
 905	}
 906
 907	return ms
 908}
 909
 910func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 911	switch evalMatchTree(cp, cost, known, t.child) {
 912	case matchesRequiresHigherCost:
 913		return matchesRequiresHigherCost
 914	case matchesFound:
 915		return matchesNone
 916	case matchesNone:
 917		return matchesFound
 918	default:
 919		panic("unreachable")
 920	}
 921}
 922
 923func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 924	return evalMatchTree(cp, cost, known, t.child)
 925}
 926
 927func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 928	return evalMatchTree(cp, cost, known, t.child)
 929}
 930
 931func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 932	if t.contEvaluated {
 933		return matchesStateForSlice(t.current)
 934	}
 935
 936	if len(t.current) == 0 {
 937		return matchesNone
 938	}
 939
 940	if t.fileName && cost < costMemory {
 941		return matchesRequiresHigherCost
 942	}
 943
 944	if !t.fileName && cost < costContent {
 945		return matchesRequiresHigherCost
 946	}
 947
 948	pruned := t.current[:0]
 949	for _, m := range t.current {
 950		if m.byteOffset == 0 && m.runeOffset > 0 {
 951			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 952		}
 953		if m.matchContent(cp.data(m.fileName)) {
 954			pruned = append(pruned, m)
 955		}
 956	}
 957	t.current = pruned
 958	t.contEvaluated = true
 959
 960	return matchesStateForSlice(t.current)
 961}
 962
 963type matchTreeOpt struct {
 964	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 965	// This was added since we do not support wordMatchTree with symbol search.
 966	DisableWordMatchOptimization bool
 967}
 968
 969func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 970	if q == nil {
 971		return nil, fmt.Errorf("got nil (sub)query")
 972	}
 973	switch s := q.(type) {
 974	case *query.Regexp:
 975		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 976		// superset of the regexp. If the returned matchTree is equivalent to the
 977		// original regexp, it returns true. An equivalent matchTree has the same
 978		// behaviour as the original regexp and can be used instead.
 979		//
 980		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 981		if err != nil {
 982			return nil, err
 983		}
 984		// if the query can be used in place of the regexp
 985		// return the subtree
 986		if isEq {
 987			return subMT, nil
 988		}
 989
 990		var tr matchTree
 991		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 992			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 993			// provide something faster.
 994			tr = wmt
 995		} else {
 996			tr = newRegexpMatchTree(s)
 997		}
 998
 999		return &andMatchTree{
1000			children: []matchTree{
1001				tr, &noVisitMatchTree{subMT},
1002			},
1003		}, nil
1004	case *query.And:
1005		var r []matchTree
1006		for _, ch := range s.Children {
1007			ct, err := d.newMatchTree(ch, opt)
1008			if err != nil {
1009				return nil, err
1010			}
1011			r = append(r, ct)
1012		}
1013		return &andMatchTree{r}, nil
1014	case *query.Or:
1015		var r []matchTree
1016		for _, ch := range s.Children {
1017			ct, err := d.newMatchTree(ch, opt)
1018			if err != nil {
1019				return nil, err
1020			}
1021			r = append(r, ct)
1022		}
1023		return &orMatchTree{r}, nil
1024	case *query.Not:
1025		ct, err := d.newMatchTree(s.Child, opt)
1026		return &notMatchTree{
1027			child: ct,
1028		}, err
1029
1030	case *query.Type:
1031		if s.Type != query.TypeFileName {
1032			break
1033		}
1034
1035		ct, err := d.newMatchTree(s.Child, opt)
1036		if err != nil {
1037			return nil, err
1038		}
1039
1040		return &fileNameMatchTree{
1041			child: ct,
1042		}, nil
1043
1044	case *query.Boost:
1045		ct, err := d.newMatchTree(s.Child, opt)
1046		if err != nil {
1047			return nil, err
1048		}
1049
1050		return &boostMatchTree{
1051			child: ct,
1052			boost: s.Boost,
1053		}, nil
1054
1055	case *query.Substring:
1056		return d.newSubstringMatchTree(s)
1057
1058	case *query.Branch:
1059		masks := make([]uint64, 0, len(d.repoMetaData))
1060		if s.Pattern == "HEAD" {
1061			for range d.repoMetaData {
1062				masks = append(masks, 1)
1063			}
1064		} else {
1065			for _, branchIDs := range d.branchIDs {
1066				mask := uint64(0)
1067				for nm, m := range branchIDs {
1068					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1069						mask |= uint64(m)
1070					}
1071				}
1072				masks = append(masks, mask)
1073			}
1074		}
1075		return &branchQueryMatchTree{
1076			masks:     masks,
1077			fileMasks: d.fileBranchMasks,
1078			repos:     d.repos,
1079		}, nil
1080	case *query.Const:
1081		if s.Value {
1082			return &bruteForceMatchTree{}, nil
1083		} else {
1084			return &noMatchTree{Why: "const"}, nil
1085		}
1086	case *query.Language:
1087		code, ok := d.metaData.LanguageMap[s.Language]
1088		if !ok {
1089			return &noMatchTree{Why: "lang"}, nil
1090		}
1091		return &docMatchTree{
1092			reason:  "language",
1093			numDocs: d.numDocs(),
1094			predicate: func(docID uint32) bool {
1095				return d.getLanguage(docID) == code
1096			},
1097		}, nil
1098
1099	case *query.Symbol:
1100		// Disable WordMatchTree since we don't support it in symbols yet.
1101		optCopy := opt
1102		optCopy.DisableWordMatchOptimization = true
1103
1104		subMT, err := d.newMatchTree(s.Expr, optCopy)
1105		if err != nil {
1106			return nil, err
1107		}
1108
1109		if substr, ok := subMT.(*substrMatchTree); ok {
1110			return &symbolSubstrMatchTree{
1111				substrMatchTree: substr,
1112				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1113				fileEndRunes:    d.fileEndRunes,
1114				fileEndSymbol:   d.fileEndSymbol,
1115				sections:        d.runeDocSections,
1116			}, nil
1117		}
1118
1119		var regexpMT *regexpMatchTree
1120		visitMatchTree(subMT, func(mt matchTree) {
1121			if t, ok := mt.(*regexpMatchTree); ok {
1122				regexpMT = t
1123			}
1124		})
1125		if regexpMT == nil {
1126			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1127		}
1128
1129		return &symbolRegexpMatchTree{
1130			regexp:    regexpMT.regexp,
1131			all:       isRegexpAll(regexpMT.origRegexp),
1132			matchTree: subMT,
1133		}, nil
1134
1135	case *query.FileNameSet:
1136		return &docMatchTree{
1137			reason:  "FileNameSet",
1138			numDocs: d.numDocs(),
1139			predicate: func(docID uint32) bool {
1140				fileName := d.fileName(docID)
1141				_, ok := s.Set[string(fileName)]
1142				return ok
1143			},
1144		}, nil
1145
1146	case *query.BranchesRepos:
1147		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1148		for repoIdx := range d.repoMetaData {
1149			var mask uint64
1150			for _, br := range s.List {
1151				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1152					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1153				}
1154			}
1155			reposBranchesWant[repoIdx] = mask
1156		}
1157		return &docMatchTree{
1158			reason:  "BranchesRepos",
1159			numDocs: d.numDocs(),
1160			predicate: func(docID uint32) bool {
1161				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1162			},
1163		}, nil
1164
1165	case *query.RepoSet:
1166		reposWant := make([]bool, len(d.repoMetaData))
1167		for repoIdx, r := range d.repoMetaData {
1168			if _, ok := s.Set[r.Name]; ok {
1169				reposWant[repoIdx] = true
1170			}
1171		}
1172		return &docMatchTree{
1173			reason:  "RepoSet",
1174			numDocs: d.numDocs(),
1175			predicate: func(docID uint32) bool {
1176				return reposWant[d.repos[docID]]
1177			},
1178		}, nil
1179
1180	case *query.RepoIDs:
1181		reposWant := make([]bool, len(d.repoMetaData))
1182		for repoIdx, r := range d.repoMetaData {
1183			if s.Repos.Contains(r.ID) {
1184				reposWant[repoIdx] = true
1185			}
1186		}
1187		return &docMatchTree{
1188			reason:  "RepoIDs",
1189			numDocs: d.numDocs(),
1190			predicate: func(docID uint32) bool {
1191				return reposWant[d.repos[docID]]
1192			},
1193		}, nil
1194
1195	case *query.Repo:
1196		reposWant := make([]bool, len(d.repoMetaData))
1197		for repoIdx, r := range d.repoMetaData {
1198			if s.Regexp.MatchString(r.Name) {
1199				reposWant[repoIdx] = true
1200			}
1201		}
1202		return &docMatchTree{
1203			reason:  "Repo",
1204			numDocs: d.numDocs(),
1205			predicate: func(docID uint32) bool {
1206				return reposWant[d.repos[docID]]
1207			},
1208		}, nil
1209
1210	case *query.RepoRegexp:
1211		reposWant := make([]bool, len(d.repoMetaData))
1212		for repoIdx, r := range d.repoMetaData {
1213			if s.Regexp.MatchString(r.Name) {
1214				reposWant[repoIdx] = true
1215			}
1216		}
1217		return &docMatchTree{
1218			reason:  "RepoRegexp",
1219			numDocs: d.numDocs(),
1220			predicate: func(docID uint32) bool {
1221				return reposWant[d.repos[docID]]
1222			},
1223		}, nil
1224
1225	case query.RawConfig:
1226		return &docMatchTree{
1227			reason:  s.String(),
1228			numDocs: d.numDocs(),
1229			predicate: func(docID uint32) bool {
1230				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1231			},
1232		}, nil
1233	}
1234	log.Panicf("type %T", q)
1235	return nil, nil
1236}
1237
1238func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1239	st := &substrMatchTree{
1240		query:         s,
1241		caseSensitive: s.CaseSensitive,
1242		fileName:      s.FileName,
1243	}
1244
1245	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1246		return newRegexpMatchTree(&query.Regexp{
1247			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1248			FileName:      s.FileName,
1249			Content:       s.Content,
1250			CaseSensitive: s.CaseSensitive,
1251		}), nil
1252	}
1253
1254	result, err := d.iterateNgrams(s)
1255	if err != nil {
1256		return nil, err
1257	}
1258	st.matchIterator = result
1259	return st, nil
1260}
1261
1262func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1263	if opt.DisableWordMatchOptimization {
1264		return nil, false
1265	}
1266	// Needs to be case sensitive
1267	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1268		return nil, false
1269	}
1270	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1271	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1272		return nil, false
1273	}
1274	sub := q.Regexp.Sub
1275	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1276		return nil, false
1277	}
1278
1279	return &wordMatchTree{
1280		word:     string(sub[1].Rune),
1281		fileName: q.FileName,
1282	}, true
1283}
1284
1285// pruneMatchTree removes impossible branches from the matchTree, as indicated
1286// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1287func pruneMatchTree(mt matchTree) (matchTree, error) {
1288	var err error
1289	switch mt := mt.(type) {
1290	// leaf nodes that we test with the filter:
1291	case *substrMatchTree:
1292		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1293			if _, ok := res.matchIterator.(*noMatchTree); ok {
1294				return nil, nil
1295			}
1296		}
1297	// recursive tree structures:
1298	case *andMatchTree:
1299		// Any branch of an and becoming impossible means the entire clause
1300		// is impossible. Otherwise, just handle rewrites.
1301		for i, child := range mt.children {
1302			newChild, err := pruneMatchTree(child)
1303			if err != nil {
1304				return nil, err
1305			}
1306			if newChild == nil {
1307				return nil, nil
1308			}
1309			mt.children[i] = newChild
1310		}
1311	case *orMatchTree:
1312		// *All* branches of an OR becoming impossible means the entire clause
1313		// is impossible. Otherwise, drop impossible subclauses and handle
1314		// rewrites, including simplifying to a singular resulting child branch.
1315		n := 0
1316		for _, child := range mt.children {
1317			newChild, err := pruneMatchTree(child)
1318			if err != nil {
1319				return nil, err
1320			}
1321			if newChild != nil {
1322				mt.children[n] = newChild
1323				n++
1324			}
1325		}
1326		mt.children = mt.children[:n]
1327		if len(mt.children) == 1 {
1328			return mt.children[0], nil
1329		} else if len(mt.children) == 0 {
1330			return nil, nil
1331		}
1332	case *noVisitMatchTree:
1333		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1334		if err != nil {
1335			return nil, err
1336		}
1337		if mt.matchTree == nil {
1338			return nil, nil
1339		}
1340	case *fileNameMatchTree:
1341		mt.child, err = pruneMatchTree(mt.child)
1342		if err != nil {
1343			return nil, err
1344		}
1345		if mt.child == nil {
1346			return nil, nil
1347		}
1348	case *boostMatchTree:
1349		mt.child, err = pruneMatchTree(mt.child)
1350		if err != nil {
1351			return nil, err
1352		}
1353		if mt.child == nil {
1354			return nil, nil
1355		}
1356	case *andLineMatchTree:
1357		child, err := pruneMatchTree(&mt.andMatchTree)
1358		if err != nil {
1359			return nil, err
1360		}
1361		if child == nil {
1362			return nil, nil
1363		}
1364		if c, ok := child.(*andMatchTree); ok {
1365			mt.andMatchTree = *c
1366		} else {
1367			// the and was simplified to a single clause,
1368			// so the linematch portion is irrelevant.
1369			return mt, nil
1370		}
1371	case *notMatchTree:
1372		mt.child, err = pruneMatchTree(mt.child)
1373		if err != nil {
1374			return nil, err
1375		}
1376		if mt.child == nil {
1377			// not false => true
1378			return &bruteForceMatchTree{}, nil
1379		}
1380	// unhandled:
1381	case *docMatchTree:
1382	case *bruteForceMatchTree:
1383	case *regexpMatchTree:
1384	case *wordMatchTree:
1385	}
1386	return mt, err
1387}
1388
1389// isRegexpAll returns true if the query matches all possible lines.
1390//
1391// Note: it is possible for a funky regex to actually match all but this
1392// returns false. This returns true for normal looking regexes like ".*" or
1393// "(?-s:.*)".
1394func isRegexpAll(r *syntax.Regexp) bool {
1395	// Note: we don't care about any flags since we are looking for .* and we
1396	// don't mind if . matches all or everything but newline.
1397
1398	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1399	for {
1400		// Our main target: .*
1401		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1402			// (?s:.) or (?-s:.)
1403			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1404		}
1405
1406		// Strip away expressions being wrapped in paranthesis
1407		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1408			r = r.Sub[0]
1409			continue
1410		}
1411
1412		return false
1413	}
1414}
Configure Feed

Configure Feed