From 900f5623e8d944c23741d85a89d895365931dbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Tue, 20 Jun 2023 14:13:48 +0300 Subject: [PATCH 01/11] add error page classifier --- README.md | 3 +- common/errorpageclassifier/classifier.go | 195 +++++++++++++++++ common/errorpageclassifier/clf.gob | Bin 0 -> 3303 bytes common/errorpageclassifier/cm.go | 105 +++++++++ common/errorpageclassifier/dataset.txt | 201 ++++++++++++++++++ .../errorpageclassifier.go | 136 ++++++++++++ .../errorpageclassifier_test.go | 53 +++++ go.mod | 3 + go.sum | 6 + runner/options.go | 2 + runner/runner.go | 28 ++- runner/types.go | 1 + 12 files changed, 723 insertions(+), 10 deletions(-) create mode 100644 common/errorpageclassifier/classifier.go create mode 100644 common/errorpageclassifier/clf.gob create mode 100644 common/errorpageclassifier/cm.go create mode 100644 common/errorpageclassifier/dataset.txt create mode 100644 common/errorpageclassifier/errorpageclassifier.go create mode 100644 common/errorpageclassifier/errorpageclassifier_test.go diff --git a/README.md b/README.md index 4355b1cd..7ae77290 100644 --- a/README.md +++ b/README.md @@ -131,13 +131,14 @@ EXTRACTOR: FILTERS: -fc, -filter-code string filter response with specified status code (-fc 403,401) + -fep, -filter-error-page filter response with ML based error page detection -fl, -filter-length string filter response with specified content length (-fl 23,33) -flc, -filter-line-count string filter response body with specified line count (-flc 423,532) -fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532) -ffc, -filter-favicon string[] filter response with specified favicon hash (-mfc 1494302000) -fs, -filter-string string filter response with specified string (-fs admin) -fe, -filter-regex string filter response with specified regex (-fe admin) - -fcdn, -filter-cdn string[] filter host with specified cdn provider (incapsula, oracle, google, azure, 
cloudflare, cloudfront, fastly, akamai, sucuri, leaseweb) + -fcdn, -filter-cdn string[] filter host with specified cdn provider (google, leaseweb, stackpath, cloudfront, fastly) -frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1') -fdc, -filter-condition string filter response with dsl expression condition diff --git a/common/errorpageclassifier/classifier.go b/common/errorpageclassifier/classifier.go new file mode 100644 index 00000000..c6553b2b --- /dev/null +++ b/common/errorpageclassifier/classifier.go @@ -0,0 +1,195 @@ +//ref: https://github.com/sausheong/gonb + +package errorpageclassifier + +import ( + "bytes" + "encoding/gob" + "fmt" + "os" + "regexp" + "sort" + "strings" + + "github.com/kljensen/snowball" +) + +var ( + cleaner = regexp.MustCompile(`[^\w\s]`) + stopWords = map[string]bool{"a": true, "able": true, "about": true, "above": true, "abroad": true, "according": true, "accordingly": true, "across": true, "actually": true, "adj": true, "after": true, "afterwards": true, "again": true, "against": true, "ago": true, "ahead": true, "ain't": true, "all": true, "allow": true, "allows": true, "almost": true, "alone": true, "along": true, "alongside": true, "already": true, "also": true, "although": true, "always": true, "am": true, "amid": true, "amidst": true, "among": true, "amongst": true, "an": true, "and": true, "another": true, "any": true, "anybody": true, "anyhow": true, "anyone": true, "anything": true, "anyway": true, "anyways": true, "anywhere": true, "apart": true, "appear": true, "appreciate": true, "appropriate": true, "are": true, "aren't": true, "around": true, "as": true, "a's": true, "aside": true, "ask": true, "asking": true, "associated": true, "at": true, "available": true, "away": true, "awfully": true, "b": true, "back": true, "backward": true, "backwards": true, "be": true, "became": true, "because": true, "become": true, "becomes": true, "becoming": true, "been": true, "before": 
true, "beforehand": true, "begin": true, "behind": true, "being": true, "believe": true, "below": true, "beside": true, "besides": true, "best": true, "better": true, "between": true, "beyond": true, "both": true, "brief": true, "but": true, "by": true, "c": true, "came": true, "can": true, "cannot": true, "cant": true, "can't": true, "caption": true, "cause": true, "causes": true, "certain": true, "certainly": true, "changes": true, "clearly": true, "c'mon": true, "co": true, "co.": true, "com": true, "come": true, "comes": true, "concerning": true, "consequently": true, "consider": true, "considering": true, "contain": true, "containing": true, "contains": true, "corresponding": true, "could": true, "couldn't": true, "course": true, "c's": true, "currently": true, "d": true, "dare": true, "daren't": true, "definitely": true, "described": true, "despite": true, "did": true, "didn't": true, "different": true, "directly": true, "do": true, "does": true, "doesn't": true, "doing": true, "done": true, "don't": true, "down": true, "downwards": true, "during": true, "e": true, "each": true, "edu": true, "eg": true, "eight": true, "eighty": true, "either": true, "else": true, "elsewhere": true, "end": true, "ending": true, "enough": true, "entirely": true, "especially": true, "et": true, "etc": true, "even": true, "ever": true, "evermore": true, "every": true, "everybody": true, "everyone": true, "everything": true, "everywhere": true, "ex": true, "exactly": true, "example": true, "except": true, "f": true, "fairly": true, "far": true, "farther": true, "few": true, "fewer": true, "fifth": true, "first": true, "five": true, "followed": true, "following": true, "follows": true, "for": true, "forever": true, "former": true, "formerly": true, "forth": true, "forward": true, "found": true, "four": true, "from": true, "further": true, "furthermore": true, "g": true, "get": true, "gets": true, "getting": true, "given": true, "gives": true, "go": true, "goes": true, "going": 
true, "gone": true, "got": true, "gotten": true, "greetings": true, "h": true, "had": true, "hadn't": true, "half": true, "happens": true, "hardly": true, "has": true, "hasn't": true, "have": true, "haven't": true, "having": true, "he": true, "he'd": true, "he'll": true, "hello": true, "help": true, "hence": true, "her": true, "here": true, "hereafter": true, "hereby": true, "herein": true, "here's": true, "hereupon": true, "hers": true, "herself": true, "he's": true, "hi": true, "him": true, "himself": true, "his": true, "hither": true, "hopefully": true, "how": true, "howbeit": true, "however": true, "hundred": true, "i": true, "i'd": true, "ie": true, "if": true, "ignored": true, "i'll": true, "i'm": true, "immediate": true, "in": true, "inasmuch": true, "inc": true, "inc.": true, "indeed": true, "indicate": true, "indicated": true, "indicates": true, "inner": true, "inside": true, "insofar": true, "instead": true, "into": true, "inward": true, "is": true, "isn't": true, "it": true, "it'd": true, "it'll": true, "its": true, "it's": true, "itself": true, "i've": true, "j": true, "just": true, "k": true, "keep": true, "keeps": true, "kept": true, "know": true, "known": true, "knows": true, "l": true, "last": true, "lately": true, "later": true, "latter": true, "latterly": true, "least": true, "less": true, "lest": true, "let": true, "let's": true, "like": true, "liked": true, "likely": true, "likewise": true, "little": true, "look": true, "looking": true, "looks": true, "low": true, "lower": true, "ltd": true, "m": true, "made": true, "mainly": true, "make": true, "makes": true, "many": true, "may": true, "maybe": true, "mayn't": true, "me": true, "mean": true, "meantime": true, "meanwhile": true, "merely": true, "might": true, "mightn't": true, "mine": true, "minus": true, "miss": true, "more": true, "moreover": true, "most": true, "mostly": true, "mr": true, "mrs": true, "much": true, "must": true, "mustn't": true, "my": true, "myself": true, "n": true, "name": 
true, "namely": true, "nd": true, "near": true, "nearly": true, "necessary": true, "need": true, "needn't": true, "needs": true, "neither": true, "never": true, "neverf": true, "neverless": true, "nevertheless": true, "new": true, "next": true, "nine": true, "ninety": true, "no": true, "nobody": true, "non": true, "none": true, "nonetheless": true, "noone": true, "no-one": true, "nor": true, "normally": true, "not": true, "nothing": true, "notwithstanding": true, "novel": true, "now": true, "nowhere": true, "o": true, "obviously": true, "of": true, "off": true, "often": true, "oh": true, "ok": true, "okay": true, "old": true, "on": true, "once": true, "one": true, "ones": true, "one's": true, "only": true, "onto": true, "opposite": true, "or": true, "other": true, "others": true, "otherwise": true, "ought": true, "oughtn't": true, "our": true, "ours": true, "ourselves": true, "out": true, "outside": true, "over": true, "overall": true, "own": true, "p": true, "particular": true, "particularly": true, "past": true, "per": true, "perhaps": true, "placed": true, "please": true, "plus": true, "possible": true, "presumably": true, "probably": true, "provided": true, "provides": true, "q": true, "que": true, "quite": true, "qv": true, "r": true, "rather": true, "rd": true, "re": true, "really": true, "reasonably": true, "recent": true, "recently": true, "regarding": true, "regardless": true, "regards": true, "relatively": true, "respectively": true, "right": true, "round": true, "s": true, "said": true, "same": true, "saw": true, "say": true, "saying": true, "says": true, "second": true, "secondly": true, "see": true, "seeing": true, "seem": true, "seemed": true, "seeming": true, "seems": true, "seen": true, "self": true, "selves": true, "sensible": true, "sent": true, "serious": true, "seriously": true, "seven": true, "several": true, "shall": true, "shan't": true, "she": true, "she'd": true, "she'll": true, "she's": true, "should": true, "shouldn't": true, "since": 
true, "six": true, "so": true, "some": true, "somebody": true, "someday": true, "somehow": true, "someone": true, "something": true, "sometime": true, "sometimes": true, "somewhat": true, "somewhere": true, "soon": true, "sorry": true, "specified": true, "specify": true, "specifying": true, "still": true, "sub": true, "such": true, "sup": true, "sure": true, "t": true, "take": true, "taken": true, "taking": true, "tell": true, "tends": true, "th": true, "than": true, "thank": true, "thanks": true, "thanx": true, "that": true, "that'll": true, "thats": true, "that's": true, "that've": true, "the": true, "their": true, "theirs": true, "them": true, "themselves": true, "then": true, "thence": true, "there": true, "thereafter": true, "thereby": true, "there'd": true, "therefore": true, "therein": true, "there'll": true, "there're": true, "theres": true, "there's": true, "thereupon": true, "there've": true, "these": true, "they": true, "they'd": true, "they'll": true, "they're": true, "they've": true, "thing": true, "things": true, "think": true, "third": true, "thirty": true, "this": true, "thorough": true, "thoroughly": true, "those": true, "though": true, "three": true, "through": true, "throughout": true, "thru": true, "thus": true, "till": true, "to": true, "together": true, "too": true, "took": true, "toward": true, "towards": true, "tried": true, "tries": true, "truly": true, "try": true, "trying": true, "t's": true, "twice": true, "two": true, "u": true, "un": true, "under": true, "underneath": true, "undoing": true, "unfortunately": true, "unless": true, "unlike": true, "unlikely": true, "until": true, "unto": true, "up": true, "upon": true, "upwards": true, "us": true, "use": true, "used": true, "useful": true, "uses": true, "using": true, "usually": true, "v": true, "value": true, "various": true, "versus": true, "very": true, "via": true, "viz": true, "vs": true, "w": true, "want": true, "wants": true, "was": true, "wasn't": true, "way": true, "we": true, 
"we'd": true, "welcome": true, "well": true, "we'll": true, "went": true, "were": true, "we're": true, "weren't": true, "we've": true, "what": true, "whatever": true, "what'll": true, "what's": true, "what've": true, "when": true, "whence": true, "whenever": true, "where": true, "whereafter": true, "whereas": true, "whereby": true, "wherein": true, "where's": true, "whereupon": true, "wherever": true, "whether": true, "which": true, "whichever": true, "while": true, "whilst": true, "whither": true, "who": true, "who'd": true, "whoever": true, "whole": true, "who'll": true, "whom": true, "whomever": true, "who's": true, "whose": true, "why": true, "will": true, "willing": true, "wish": true, "with": true, "within": true, "without": true, "wonder": true, "won't": true, "would": true, "wouldn't": true, "x": true, "y": true, "yes": true, "yet": true, "you": true, "you'd": true, "you'll": true, "your": true, "you're": true, "yours": true, "yourself": true, "yourselves": true, "you've": true, "z": true, "zero": true} +) + +type Sorted struct { + Category string + Probability float64 +} + +// Classifier is what we use to classify documents +type Classifier struct { + Words map[string]map[string]int + TotalWords int + CategoriesDocuments map[string]int + TotalDocuments int + CategoriesWords map[string]int + Threshold float64 +} + +// create and initialize the classifier +func NewClassifier(categories []string, threshold float64) *Classifier { + classifier := &Classifier{ + Words: make(map[string]map[string]int), + TotalWords: 0, + CategoriesDocuments: make(map[string]int), + TotalDocuments: 0, + CategoriesWords: make(map[string]int), + Threshold: threshold, + } + + for _, category := range categories { + classifier.Words[category] = make(map[string]int) + classifier.CategoriesDocuments[category] = 0 + classifier.CategoriesWords[category] = 0 + } + return classifier +} + +// create and initialize the classifier from a file +func NewClassifierFromFile(path string) 
(*Classifier, error) {
+	classifier := &Classifier{}
+
+	fl, err := os.Open(path)
+	if err != nil {
+		return classifier, err
+	}
+	defer fl.Close()
+
+	// The returned pointer is only meaningful when the error is nil.
+	if err := gob.NewDecoder(fl).Decode(classifier); err != nil {
+		return classifier, err
+	}
+
+	return classifier, nil
+}
+
+// NewClassifierFromFileData creates a classifier from gob-encoded bytes,
+// i.e. the same encoding SaveClassifierToFile writes. The returned pointer
+// is only meaningful when the error is nil.
+func NewClassifierFromFileData(data []byte) (*Classifier, error) {
+	classifier := &Classifier{}
+	if err := gob.NewDecoder(bytes.NewReader(data)).Decode(classifier); err != nil {
+		return classifier, err
+	}
+
+	return classifier, nil
+}
+
+// SaveClassifierToFile gob-encodes the classifier into the file at path,
+// creating or truncating it. It returns the first error hit while creating,
+// encoding or closing the file.
+func (c *Classifier) SaveClassifierToFile(path string) (err error) {
+	fl, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	// A failed Close on a file we wrote to can mean lost data, so report it
+	// unless an earlier error is already being returned.
+	defer func() {
+		if closeErr := fl.Close(); err == nil {
+			err = closeErr
+		}
+	}()
+
+	return gob.NewEncoder(fl).Encode(c)
+}
+
+// Train adds the words of document to the statistics of category.
+// Categories that were not registered when the classifier was created are
+// registered on first use instead of panicking on a nil map.
+func (c *Classifier) Train(category string, document string) {
+	if c.Words == nil {
+		c.Words = make(map[string]map[string]int)
+	}
+	if c.Words[category] == nil {
+		c.Words[category] = make(map[string]int)
+	}
+	if c.CategoriesWords == nil {
+		c.CategoriesWords = make(map[string]int)
+	}
+	if c.CategoriesDocuments == nil {
+		c.CategoriesDocuments = make(map[string]int)
+	}
+
+	for word, count := range countWords(document) {
+		c.Words[category][word] += count
+		c.CategoriesWords[category] += count
+		c.TotalWords += count
+	}
+	c.CategoriesDocuments[category]++
+	c.TotalDocuments++
+}
+
+// Classify returns the most probable category for document when its score
+// exceeds the runner-up's score by more than a factor of c.Threshold, and
+// "other" otherwise.
+//
+// With no categories it returns "other"; with a single category there is no
+// runner-up to compare against, so that category is returned.
+func (c *Classifier) Classify(document string) (category string) {
+	ranked := c.rankCategories(document)
+
+	switch len(ranked) {
+	case 0:
+		return "other"
+	case 1:
+		return ranked[0].Category
+	}
+
+	// if the highest score is far enough above the runner-up select that
+	if ranked[0].Probability/ranked[1].Probability > c.Threshold {
+		return ranked[0].Category
+	}
+	return "other"
+}
+
+// rankCategories scores every category for document and returns the scores
+// in descending order, ties broken by category name so the result does not
+// depend on map iteration order.
+//
+// The scores use the same factors as Probabilities, but all categories are
+// divided by the current best score whenever it gets very small. Only the
+// ratios between the scores are meaningful afterwards. Without this the
+// product of many per-word probabilities underflows to zero for long
+// documents, the ratio in Classify becomes 0/0 and every long document ends
+// up as "other".
+func (c *Classifier) rankCategories(document string) []Sorted {
+	const rescaleBelow = 1e-150
+
+	ranked := make([]Sorted, 0, len(c.Words))
+	for category := range c.Words {
+		ranked = append(ranked, Sorted{Category: category, Probability: c.pCategory(category)})
+	}
+
+	for word := range countWords(document) {
+		best := 0.0
+		for i := range ranked {
+			ranked[i].Probability *= c.pWordCategory(ranked[i].Category, word)
+			if ranked[i].Probability > best {
+				best = ranked[i].Probability
+			}
+		}
+		if best > 0 && best < rescaleBelow {
+			for i := range ranked {
+				ranked[i].Probability /= best
+			}
+		}
+	}
+
+	sort.Slice(ranked, func(i, j int) bool {
+		if ranked[i].Probability != ranked[j].Probability {
+			return ranked[i].Probability > ranked[j].Probability
+		}
+		return ranked[i].Category < ranked[j].Category
+	})
+	return ranked
+}
+
+// Probabilities returns, for every category, p(document | category) * p(category).
+// The values are plain products and are not normalised; for documents with
+// many distinct words they can underflow to zero.
+func (c *Classifier) Probabilities(document string) (p map[string]float64) {
+	p = make(map[string]float64, len(c.Words))
+	for category := range c.Words {
+		p[category] = c.pCategoryDocument(category, document)
+	}
+	return p
+}
+
+// pDocumentCategory is p (document | category): the product of the word
+// probabilities of every distinct word in document. How often a word occurs
+// in the document is not taken into account.
+func (c *Classifier) pDocumentCategory(category string, document string) (p float64) {
+	p = 1.0
+	for word := range countWords(document) {
+		p *= c.pWordCategory(category, word)
+	}
+	return p
+}
+
+// pWordCategory is p (word | category) with one added to the word count so
+// unseen words do not zero the product.
+//
+// NOTE(review): the denominator is the plain word total of the category, so
+// this is not full Laplace smoothing and it divides by zero for a category
+// without training words. The word passed in by pDocumentCategory has
+// already been stemmed by countWords and is stemmed again here. Both are
+// left unchanged so an already trained model keeps producing the same
+// numbers - confirm before changing.
+func (c *Classifier) pWordCategory(category string, word string) float64 {
+	return float64(c.Words[category][stem(word)]+1) / float64(c.CategoriesWords[category])
+}
+
+// pCategory is p (category): the share of training documents in category.
+func (c *Classifier) pCategory(category string) float64 {
+	return float64(c.CategoriesDocuments[category]) / float64(c.TotalDocuments)
+}
+
+// pCategoryDocument is the unnormalised p (category | document).
+func (c *Classifier) pCategoryDocument(category string, document string) float64 {
+	return c.pDocumentCategory(category, document) * c.pCategory(category)
+}
+
+// countWords cleans document, splits it on single spaces, drops stop words,
+// and counts the remaining words by their lower-cased stem.
+//
+// NOTE(review): the stop word lookup happens before lower-casing and after
+// punctuation has been stripped, so capitalised stop words and entries that
+// contain an apostrophe are not filtered; consecutive spaces yield empty
+// tokens that are counted too. Kept as is because training used the same
+// tokenisation - confirm before changing.
+func countWords(document string) (wordCount map[string]int) {
+	wordCount = make(map[string]int)
+	for _, word := range strings.Split(cleanDocument(document), " ") {
+		if stopWords[word] {
+			continue
+		}
+		wordCount[stem(strings.ToLower(word))]++
+	}
+	return wordCount
+}
+
+// cleanDocument removes every character matched by the cleaner pattern.
+func cleanDocument(text string) string {
+	return cleaner.ReplaceAllString(text, "")
+}
+
+// stem a word using the Snowball algorithm. If stemming fails the word is
+// returned unchanged and a note is written to standard error, so standard
+// output is not polluted by the library.
+func stem(word string) string {
+	stemmed, err := snowball.Stem(word, "english", true)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "Cannot stem word:", word)
+		return word
+	}
+	return stemmed
+}
diff --git a/common/errorpageclassifier/clf.gob b/common/errorpageclassifier/clf.gob
new file mode 100644
index 0000000000000000000000000000000000000000..f6978625dcf2fcd392e531f936a16151eb4c44fc
GIT binary patch
literal 3303
zcmYjU%Z}to70nD^ha;0zQEEo50kIqjsaHi$!Is6aMhK0>0wED;hwZ4cBi!-GtX$6U
z_yQKdhp=P;dp?1GV8^P8b0f1JEp=CSrX6wbx#w~F>6@o-s`YyN)udyzy)|L|^zCXb
z*Y&S`=;He6PpkFzYoGLV{x_}G-}_1@Gx%Ul{JC%QY+S-~e^{-r%5$%u;o0xLK6^P2
z&uzXQgNdV`y0u!ZRv(`JNbB|YXTAI;reNLh+c$r;E|nT~e8cMM>G!l2eAjMGtJUM~
zM^E3bMQuXx;pyMiPi}7MJITz84?5V%QXNz7tf+@zBxi!yjCL3&{7!=UJWWE*p64T~glih>4tla3SLTvN ziL#x{!33%o3?$+l6QYfY>OSBJs)L#RK>$S`j6!F?D`h9E941A47YYHEw0QxY>Iv^S zRR^wus5d&$_5!%oPSj~M!F*mHY-hYsz0a=0)TtSO@ll^tV*o>TN!7S5&q5B9P3Grz47Ya=rtk1Dv-Q_mP)fe2-jB!cT27`A1I%aqPcEF&s zfFNabXT^3l^B(Pwb^+3(jaL=VjxZrj)B<20KtMf~VVDI4?~{Qi&#_(tGXEYyVXOhNc6pZc!?(cT8_tWH$D3334EmLOl zEyHrK<|vx%^a1beRCloofexe3q1_VSr3gc@sx}WOdIz+1Qh4d!`J&s|F$+R=iA zzHM{3!YCr7Vf*)BaR^0dK)^UXi$-@{Au_J#b_sX&Y&C*9@Of83Y)T%@9#9vO>_OIR znS#&zNwPxkaGsFPC!WzJ=6!5~U6QEFM}l}{Pk*WQ8O(q*%(jNG;3PC^aH>uLL{!wn z8Q#^|<(N?^8Ui_%Nw=6LwYtpdho);v+Cq=v@*py&iF zc6>4@!$b$()O6)saN+jm$5eIMP=!LNMz=bNt1+d;JZN?y*odfS%|fbh6rW)an!RI2 zOD=hM5i~#WQD1_k8YCL20bN*&RGYo`_d*nu&jIj?js|>etI$R-_;QVwLh#NIa+@t1vFiRO^GUFgW8IS&uyB={H8m8v_x-|`jAZ^(UyFA;V2U<9bgFLWRo7f?nI zkHY8yZ4beeLH~*zyrmF@d4UVeF~SsJ8kp3?DbbDw0|)3w1rLls!8 zm=^@g59%_qXZN5Qy08P#fR>8U>UNX}p9r{MI>7K+&QPizb}xjz3_~ej#GGNKFqM$- z9u#5@LKPQa#1I>92s-6y_iXkO_WvncKqld422EB)_-AInwLhq4C>8>fhEh2Gps!8` zc(9FaNrVwc2sy#CBkqp?nbA9YDpIF-sVKI&RRts_#~VQ@R`_JPK{W8%;=r{JgLYH{ zJTR!a!-$+Y3BiVLT`Z}#>2w_TUWX3;%Pw?L6gPayNmUrfljN|!=J4ItkdSo}H^7Lu zb!P{(Rdq0Fy7T-+vFdGY+?_v3ve1Nh4u*@EV9x`J^F8RnVKq&VYMFHS1>CPq*5jOW zCs6c>V^>lvwg7*J1va2<=1{ztuzZehh+~@c9=pgThigOt4u4)5s2-Rl*HE7VpIc)O zFCn8!TJYV^USDHaaQdpa1~XQjZHYicb^!usPMjy_L;yZmS}snve+>Zf@s?4X(94qeY?c&kSzAhI zf?J&Ya2_m_L@=~al-b)b?`lXEGksZ6*r^=~L}?`<-YgKTsc0@rhxQ6d%C7UeNU&97 z%Rzu|C2w?Wdq-R^h*yp6vvEtPJ8SHSa2w3$rr8^3dI&>;4Krd3%bS_NX}E3Lt*~F= zzX&j0EG0*yr!?{|r``Sgmn^w#3y8TkfYMlt^(Rpc234XH%of2`)?m1*WIg2t5`ndp zB*Ur^k*Wq;7uy#S1TiaW2m?RXkK2E~Z2EtHwfg_8AJ>0={^C)-hQK5JWBtFs{gr?H I_h+mB15Pg-Pyhe` literal 0 HcmV?d00001 diff --git a/common/errorpageclassifier/cm.go b/common/errorpageclassifier/cm.go new file mode 100644 index 00000000..c66dee30 --- /dev/null +++ b/common/errorpageclassifier/cm.go @@ -0,0 +1,105 @@ +package errorpageclassifier + +import "fmt" + +type ConfusionMatrix struct { + matrix [][]int + labels []string +} + +func NewConfusionMatrix(actual, predicted []string, labels 
[]string) *ConfusionMatrix {
+	// Rows are actual labels, columns are predicted labels, both in the
+	// order given by labels.
+	n := len(labels)
+	matrix := make([][]int, n)
+	for i := range matrix {
+		matrix[i] = make([]int, n)
+	}
+
+	labelIndices := make(map[string]int, n)
+	for i, label := range labels {
+		labelIndices[label] = i
+	}
+
+	// Only positions present in both slices can be compared.
+	pairs := len(actual)
+	if len(predicted) < pairs {
+		pairs = len(predicted)
+	}
+
+	for i := 0; i < pairs; i++ {
+		row, knownActual := labelIndices[actual[i]]
+		col, knownPredicted := labelIndices[predicted[i]]
+		// A label that is not listed in labels has no row or column. A plain
+		// map lookup would return index 0 for it and silently book the
+		// sample under labels[0], so such pairs are left out instead.
+		if !knownActual || !knownPredicted {
+			continue
+		}
+		matrix[row][col]++
+	}
+
+	return &ConfusionMatrix{
+		matrix: matrix,
+		labels: labels,
+	}
+}
+
+// PrintConfusionMatrix writes the matrix to standard output as a table with
+// one row per actual label and one column per predicted label.
+func (cm *ConfusionMatrix) PrintConfusionMatrix() {
+	fmt.Printf("%30s\n", "Confusion Matrix")
+	fmt.Println()
+
+	// Header: an empty corner cell followed by the predicted labels.
+	fmt.Printf("%-15s", "")
+	for _, label := range cm.labels {
+		fmt.Printf("%-15s", label)
+	}
+	fmt.Println()
+
+	// One row per actual label.
+	for i := range cm.matrix {
+		fmt.Printf("%-15s", cm.labels[i])
+		for _, value := range cm.matrix[i] {
+			fmt.Printf("%-15d", value)
+		}
+		fmt.Println()
+	}
+	fmt.Println()
+}
+
+// PrintClassificationReport writes precision, recall, f1-score and support
+// for every label to standard output, followed by the overall accuracy, the
+// macro average (plain mean over the labels) and the weighted average (mean
+// over the labels weighted by support).
+//
+// A ratio whose denominator is zero, for example the precision of a label
+// that was never predicted, is reported as 0 instead of NaN.
+func (cm *ConfusionMatrix) PrintClassificationReport() {
+	div := func(num, den float64) float64 {
+		if den == 0 {
+			return 0
+		}
+		return num / den
+	}
+
+	fmt.Printf("%30s\n", "Classification Report")
+	fmt.Println()
+
+	fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support")
+
+	var (
+		totalSupport, totalCorrect      int
+		macroP, macroR, macroF          float64
+		weightedP, weightedR, weightedF float64
+	)
+
+	for i, label := range cm.labels {
+		truePos := cm.matrix[i][i]
+		falsePos, falseNeg := 0, 0
+		for j := range cm.labels {
+			if j == i {
+				continue
+			}
+			falsePos += cm.matrix[j][i] // predicted as this label, actually another
+			falseNeg += cm.matrix[i][j] // actually this label, predicted as another
+		}
+
+		precision := div(float64(truePos), float64(truePos+falsePos))
+		recall := div(float64(truePos), float64(truePos+falseNeg))
+		f1Score := div(2*precision*recall, precision+recall)
+		support := truePos + falseNeg
+
+		fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support)
+
+		totalSupport += support
+		totalCorrect += truePos
+
+		macroP += precision
+		macroR += recall
+		macroF += f1Score
+
+		weightedP += precision * float64(support)
+		weightedR += recall * float64(support)
+		weightedF += f1Score * float64(support)
+	}
+
+	accuracy := div(float64(totalCorrect), float64(totalSupport))
+	fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, totalSupport)
+
+	classes := float64(len(cm.labels))
+	fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg",
+		div(macroP, classes),
+		div(macroR, classes),
+		div(macroF, classes),
+		totalSupport)
+
+	samples := float64(totalSupport)
+	fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg",
+		div(weightedP, samples),
+		div(weightedR, samples),
+		div(weightedF, samples),
+		totalSupport)
+
+	fmt.Println()
+}
diff --git a/common/errorpageclassifier/dataset.txt b/common/errorpageclassifier/dataset.txt
new file mode 100644
index 00000000..1f9ee4e3
--- /dev/null
+++ b/common/errorpageclassifier/dataset.txt
@@ -0,0 +1,201 @@
+The Forum page seems to have a glitch. Our technicians are on it.||error
+There was a problem with the Product Details page. Try reloading.||error
+Error 500: The E-books page is experiencing a problem.||error
+Unfortunately, the Video Tutorials page is down for maintenance.||error
+Our Archive page is currently unavailable. We apologize for the inconvenience.||error
+We're having trouble loading the Membership Details page.||error
+An error occurred while trying to access the Profile Settings page.||error
+Error 404: The Team page could not be found.||error
+Our Project Highlights page seems to be having some technical issues.||error
+We're sorry, but we can't seem to find the Donations page.||error
+You've landed on our Forum page. Engage in interesting discussions.||nonerror
+Welcome to the Product Details page. Learn more about our products here.||nonerror
+You are now on our E-books page. Enjoy a wealth of knowledge.||nonerror
+This is the Video Tutorials page. 
Learn with our easy-to-follow videos.||nonerror +Welcome to our Archive. Dive into our rich history.||nonerror +You're now on the Membership Details page. See the benefits of joining us.||nonerror +This is your Profile Settings page. Update your personal details as needed.||nonerror +You're on the Team page. Meet the people behind our organization.||nonerror +Welcome to our Project Highlights page. See what we've been up to.||nonerror +You've landed on the Donations page. Every contribution helps us do more.||nonerror +500 - Server Error This is highly unusual! Our tech team have been notified and are working on it.||error +Sorry this page is currently under maintenance.||error +Access Denied - You don't have permission to access this page.||error +This page seems to be missing 404 Error!||error +Sorry something went wrong. Please try again later.||error +We're sorry this page could not be found 404.||error +The page you requested could not be found on our site.||error +500 - Internal server error. There is a problem with the resource you are looking for and it cannot be displayed.||error +Error 401 Unauthorized: Access is denied due to invalid credentials.||error +Bad request 400. Your browser sent a request that this server could not understand.||error +This is a 404 error page||error +Sorry this page does not exist||error +Error 500: Internal Server Error||error +Oops! That page can’t be found.Try searching from the field above or go to the home page.||error +An error has occurred while processing your request. It happens to the best of us! Don't worry! There are no bugs without a fix! Let's try again! What were you looking for? If you are an adventurer search this site! If difficulties persist please contact the website administrator and report the error below. 404 Page not found||error +Whoops our bad... The page you requested was not found and we have a fine guess why. If you typed the URL directly please make sure the spelling is correct. 
If you clicked on a link to get here the link is outdated. What can you do? Have no fear help is near! There are many ways you can get back on track with Magento Store. Go back to the previous page. Use the search bar at the top of the page to search for your products. Follow these links to get you back on track! Store Home My Account||error +404 - Page not found Unfortunately the requested page could not be found.||error +PAGE NOT FOUND The page you're looking for doesn't seem to exist anymore… Return to the homepage||error +Who moved my... lemon? Oh no - looks like we can't find the page you are looking for. But you know the saying; when life gives you lemons... okay we can't find a clever way to end that sentence but we do have 2 suggestions to help you find what you were looking for: Go to the front page Or Search for a specific topic If something you need really is missing we would love it if you would let us know ❤️️||error +404—page not found||error +Apologies but there's a 503 Service Unavailable error. The server cannot handle the request.||error +Sorry you don't have access rights to this page. Error 403: Forbidden.||error +404 - Oops! The page you are looking for has been misplaced.||error +Sorry the server encountered an unexpected condition that prevented it from fulfilling the request. Error 500: Internal Server Error.||error +Whoa! The page you're looking for seems to have vanished. Error 404.||error +Sorry this page has moved or doesn't exist anymore. Error 404.||error +Sorry but your request timed out. Please try again. Error 504: Gateway Timeout.||error +We're sorry but an unknown error occurred while processing your request.||error +Error 502: Bad Gateway. The server encountered a temporary error and could not complete your request.||error +The requested resource could not be found on this server. Please verify your request and try again. Error 404.||error +This Help Center page is temporarily unavailable.||error +Privacy Policy page not found. 
Please try again later.||error +There seems to be an error on our Services page. We're working to fix it.||error +An error occurred while loading the Search Results page.||error +Category page not found. It might have been removed or relocated.||error +There was a problem loading the Cart page. Please try again.||error +Our Terms of Service page is currently down for maintenance.||error +We're sorry, but the Sitemap is not available at the moment.||error +We're having trouble loading the Reviews page.||error +An error occurred while trying to access the Partners page.||error +Settings page is currently unavailable. We apologize for the inconvenience.||error +Error 404: Resources page not found.||error +Our Press Releases page seems to be having some technical issues.||error +We're sorry, but we can't seem to find the Case Studies page.||error +There was a problem loading the Community page. Please refresh the page.||error +Error 503: The Subscriptions page is temporarily unavailable.||error +There's a problem with our Customer Support page. We're on it.||error +We're having trouble finding the Notifications page. It may have been moved.||error +There was a problem with the Feedback page. Try again later.||error +Our Transactions page is currently experiencing some issues. We appreciate your patience.||error +Your request has been successfully submitted.||nonerror +You have successfully logged out.||nonerror +Congratulations on successfully completing the course!||nonerror +The payment has been processed successfully.||nonerror +Thank you for your feedback!||nonerror +Your download will start shortly.||nonerror +Profile updated successfully.||nonerror +Thanks for contacting us! We'll get back to you as soon as possible.||nonerror +Sign-up successful. Welcome to our community!||nonerror +Your booking has been confirmed. Check your email for details.||nonerror +Welcome! Your registration was successful.||nonerror +Congratulations! 
You've successfully updated your profile.||nonerror +Great! Your order was placed successfully. We'll send you an email confirmation soon.||nonerror +Welcome back! Your login was successful.||nonerror +Success! You've added the item to your cart.||nonerror +Your request was sent successfully. We'll get back to you as soon as possible.||nonerror +Great job! Your settings have been saved.||nonerror +Your message has been submitted successfully. We appreciate your feedback.||nonerror +Thank you for subscribing to our newsletter!||nonerror +Great news! Your transaction was successful.||nonerror +Welcome to our homepage. Feel free to browse around||nonerror +Thanks for signing up! You're now a registered user.||nonerror +Your order has been placed successfully! You'll receive a confirmation email shortly||nonerror +Congratulations your account has been successfully created||nonerror +Thank you for your inquiry. We will respond to your message within 24 hours||nonerror +You've successfully added the item to your cart!||nonerror +Success! Your password has been updated||nonerror +Welcome back! You have successfully logged in||nonerror +Great job! Your profile has been updated||nonerror +Your message was sent successfully. We'll get back to you shortly||nonerror +Welcome to our website. Explore and enjoy our services.||nonerror +Thank you for visiting our About Us page. Learn more about our journey and team.||nonerror +You are now browsing our Products page. Check out our latest offerings.||nonerror +This is our Contact Us page. Feel free to reach out with any queries or feedback.||nonerror +You have reached the end of the page. Scroll up to continue browsing.||nonerror +Welcome to the News section. Stay updated with our latest announcements.||nonerror +Now viewing: Image Gallery. Enjoy a visual tour of our activities.||nonerror +You're on our FAQ page. Get answers to common questions.||nonerror +Welcome to the Blog section. 
Engage with our thoughts and insights.||nonerror +This is the Discussion Forum. Join in, ask questions, or help others.||nonerror +You're on the Login page. Enter your credentials to access your account.||nonerror +Welcome to the Sign-Up page. Join our community today.||nonerror +This is your User Dashboard. Manage your account and settings here.||nonerror +You've reached the Checkout page. Review your order and proceed to payment.||nonerror +Welcome to the Download section . Access our digital resources here.||nonerror +This is the Careers page. Explore job opportunities with us.||nonerror +You're viewing the Events Calendar. Keep track of upcoming activities.||nonerror +This is the User Profile page. Update your information as needed.||nonerror +Welcome to our Testimonials page. Read reviews and stories from our users.||nonerror +You are now on the Home page. Start exploring from here.||nonerror +Welcome to home page||nonerror +You're now on our Help Center page. Find answers to common questions here.||nonerror +Welcome to our Privacy Policy page. Learn how we protect your personal information.||nonerror +You've landed on the Services page. Explore what we have to offer.||nonerror +This is the Search Results page. Did you find what you were looking for?||nonerror +Now browsing the Category page. View all items in this category.||nonerror +You're now on the Cart page. Review your selections before proceeding to checkout.||nonerror +Welcome to our Terms of Service page. Understand our conditions for providing services.||nonerror +You are currently on our Sitemap. Navigate our website with ease.||nonerror +You are on the Reviews page. Check out what others have to say about us.||nonerror +Now viewing the Partners page. Meet the organizations we collaborate with.||nonerror +You're on the Settings page. Customize your user experience.||nonerror +This is our Resources page. Access useful documents and guides.||nonerror +You've landed on the Press Releases page. 
Stay updated with our latest news.||nonerror +Welcome to our Case Studies page. Discover our past projects and achievements.||nonerror +You're now on the Community page. Connect and interact with other members.||nonerror +You are currently on the Subscriptions page. Manage your preferences here.||nonerror +Now viewing the Customer Support page. We're here to help.||nonerror +This is the Notifications page. Keep track of your updates and alerts.||nonerror +You've landed on the Feedback page. Share your thoughts with us.||nonerror +Welcome to the Transactions page. Monitor your past and current transactions.||nonerror +500 - Server Error This is highly unusual! Our tech team have been notified and are working on it.||error +Sorry this page is currently under maintenance.||error +Access Denied - You don't have permission to access this page.||error +This page seems to be missing 404 Error!||error +Sorry something went wrong. Please try again later.||error +We're sorry this page could not be found 404.||error +The page you requested could not be found on our site.||error +500 - Internal server error. There is a problem with the resource you are looking for and it cannot be displayed.||error +Error 401 Unauthorized: Access is denied due to invalid credentials.||error +Bad request 400. Your browser sent a request that this server could not understand.||error +Your request has been successfully submitted.||nonerror +You have successfully logged out.||nonerror +Congratulations on successfully completing the course!||nonerror +The payment has been processed successfully.||nonerror +Thank you for your feedback!||nonerror +Your download will start shortly.||nonerror +Profile updated successfully.||nonerror +Thanks for contacting us! We'll get back to you as soon as possible.||nonerror +Sign-up successful. Welcome to our community!||nonerror +Your booking has been confirmed. 
Check your email for details.||nonerror +This is a 404 error page||error +Sorry this page does not exist||error +Error 500: Internal Server Error||error +Oops! That page can’t be found.Try searching from the field above or go to the home page.||error +An error has occurred while processing your request. It happens to the best of us! Don't worry! There are no bugs without a fix! Let's try again! What were you looking for? If you are an adventurer search this site! If difficulties persist please contact the website administrator and report the error below. 404 Page not found||error +Whoops our bad... The page you requested was not found and we have a fine guess why. If you typed the URL directly please make sure the spelling is correct. If you clicked on a link to get here the link is outdated. What can you do? Have no fear help is near! There are many ways you can get back on track with Magento Store. Go back to the previous page. Use the search bar at the top of the page to search for your products. Follow these links to get you back on track! Store Home | My Account||error +404 - Page not found Unfortunately the requested page could not be found.||error +PAGE NOT FOUND The page you're looking for doesn't seem to exist anymore… Return to the homepage||error +Who moved my... lemon? Oh no - looks like we can't find the page you are looking for. But you know the saying; when life gives you lemons... okay we can't find a clever way to end that sentence but we do have 2 suggestions to help you find what you were looking for: Go to the front page Or Search for a specific topic If something you need really is missing we would love it if you would let us know ❤️️||error +404—page not found||error +Apologies but there's a 503 Service Unavailable error. The server cannot handle the request.||error +Sorry you don't have access rights to this page. Error 403: Forbidden.||error +404 - Oops! 
The page you are looking for has been misplaced.||error +Sorry the server encountered an unexpected condition that prevented it from fulfilling the request. Error 500: Internal Server Error.||error +Whoa! The page you're looking for seems to have vanished. Error 404.||error +Sorry this page has moved or doesn't exist anymore. Error 404.||error +Sorry but your request timed out. Please try again. Error 504: Gateway Timeout.||error +We're sorry but an unknown error occurred while processing your request.||error +Error 502: Bad Gateway. The server encountered a temporary error and could not complete your request.||error +The requested resource could not be found on this server. Please verify your request and try again. Error 404.||error +Welcome! Your registration was successful.||nonerror +Congratulations! You've successfully updated your profile.||nonerror +Great! Your order was placed successfully. We'll send you an email confirmation soon.||nonerror +Welcome back! Your login was successful.||nonerror +Success! You've added the item to your cart.||nonerror +Your request was sent successfully. We'll get back to you as soon as possible.||nonerror +Great job! Your settings have been saved.||nonerror +Your message has been submitted successfully. We appreciate your feedback.||nonerror +Thank you for subscribing to our newsletter!||nonerror +Great news! Your transaction was successful.||nonerror +Welcome to our homepage. Feel free to browse around||nonerror +Thanks for signing up! You're now a registered user.||nonerror +Your order has been placed successfully! You'll receive a confirmation email shortly||nonerror +Congratulations your account has been successfully created||nonerror +Thank you for your inquiry. We will respond to your message within 24 hours||nonerror +You've successfully added the item to your cart!||nonerror +Success! Your password has been updated||nonerror +Welcome back! You have successfully logged in||nonerror +Great job! 
Your profile has been updated||nonerror +Your message was sent successfully. We'll get back to you shortly||nonerror \ No newline at end of file diff --git a/common/errorpageclassifier/errorpageclassifier.go b/common/errorpageclassifier/errorpageclassifier.go new file mode 100644 index 00000000..ece886dd --- /dev/null +++ b/common/errorpageclassifier/errorpageclassifier.go @@ -0,0 +1,136 @@ +package errorpageclassifier + +import ( + _ "embed" + "fmt" + "math/rand" + "strings" + + "github.com/jaytaylor/html2text" +) + +const ( + modelPath = "clf.gob" + threshold = 1.1 + testPercentage = 0.2 +) + +var categories = []string{"error", "nonerror"} + +type Document struct { + Class string + Text string +} + +//go:embed dataset.txt +var dataset string + +//go:embed clf.gob +var classifierData []byte + +type ErrorPageClassifier struct { + classifier *Classifier +} + +func New() *ErrorPageClassifier { + classifier, err := NewClassifierFromFileData(classifierData) + if err != nil { + panic(err) + } + return &ErrorPageClassifier{classifier: classifier} +} + +func (n *ErrorPageClassifier) Classify(html string) string { + text, err := htmlToText(html) + if err != nil { + panic(err) + } + + if text == "" { + return "other" + } + cls := n.classifier.Classify(text) + return cls +} + +func (epc *ErrorPageClassifier) Evaluate() { + train, test := trainTestSplit() + fmt.Println("no of docs in TRAIN dataset:", len(train)) + fmt.Println("no of docs in TEST dataset:", len(test)) + + fmt.Println("Evaluating classifier on test set:") + actualTest, predictedTest := epc.testClf(test) + confusionMatrixTest := NewConfusionMatrix(actualTest, predictedTest, []string{"error", "nonerror"}) + confusionMatrixTest.PrintConfusionMatrix() + confusionMatrixTest.PrintClassificationReport() + + fmt.Println("Evaluating classifier on the first 100 docs in the train set:") + actualValidate, predictedValidate := epc.validateClf(train[0:100]) + confusionMatrixValidate := NewConfusionMatrix(actualValidate, 
predictedValidate, []string{"error", "nonerror"}) + confusionMatrixValidate.PrintConfusionMatrix() + confusionMatrixValidate.PrintClassificationReport() +} + +func (epc *ErrorPageClassifier) testClf(test []Document) ([]string, []string) { + actual := []string{} + predicted := []string{} + + for _, doc := range test { + class := epc.classifier.Classify(doc.Text) + actual = append(actual, doc.Class) + predicted = append(predicted, class) + } + return actual, predicted +} + +func (epc *ErrorPageClassifier) validateClf(validation []Document) ([]string, []string) { + actual := []string{} + predicted := []string{} + + for _, doc := range validation { + actual = append(actual, doc.Class) + sentiment := epc.classifier.Classify(doc.Text) + predicted = append(predicted, sentiment) + } + return actual, predicted +} + +func TrainAndSave() { + train, test := trainTestSplit() + clf := NewClassifier(categories, threshold) + + fmt.Println("no of docs in TRAIN dataset:", len(train)) + fmt.Println("no of docs in TEST dataset:", len(test)) + + for _, doc := range train { + clf.Train(doc.Class, doc.Text) + } + + err := clf.SaveClassifierToFile(modelPath) + if err != nil { + panic(err) + } +} + +func trainTestSplit() (train, test []Document) { + data := strings.Split(dataset, "\n") + for _, line := range data { + s := strings.Split(line, "||") + doc, sentiment := s[0], s[1] + + if rand.Float64() > testPercentage { + train = append(train, Document{sentiment, doc}) + } else { + test = append(test, Document{sentiment, doc}) + } + } + return train, test +} + +func htmlToText(html string) (string, error) { + text, err := html2text.FromString(html, html2text.Options{TextOnly: true}) + if err != nil { + return "", err + } + return text, nil +} diff --git a/common/errorpageclassifier/errorpageclassifier_test.go b/common/errorpageclassifier/errorpageclassifier_test.go new file mode 100644 index 00000000..35923b28 --- /dev/null +++ b/common/errorpageclassifier/errorpageclassifier_test.go @@ -0,0 
+1,53 @@ +package errorpageclassifier + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestErrorPageClassifier(t *testing.T) { + t.Run("test creation of new ErrorPageClassifier", func(t *testing.T) { + epc := New() + assert.NotNil(t, epc) + }) + + t.Run("test classification non error page text", func(t *testing.T) { + epc := New() + assert.Equal(t, "nonerror", epc.Classify(` + + + + Terms of Service + + +

Welcome to our Terms of Service page.

+

Understand our conditions for providing services.

+ + + `)) + }) + + t.Run("test classification on error page text", func(t *testing.T) { + epc := New() + assert.Equal(t, "error", epc.Classify(` + + + Error 403: Forbidden + + + +
+

Error 403: Forbidden

+

Sorry you don't have access rights to this page.

+
+ + + `)) + }) +} diff --git a/go.mod b/go.mod index 0a1bd6c9..9ddf2b4f 100644 --- a/go.mod +++ b/go.mod @@ -82,9 +82,11 @@ require ( github.com/google/go-querystring v1.1.0 // indirect github.com/gorilla/css v1.0.0 // indirect github.com/hashicorp/go-version v1.6.0 // indirect + github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kataras/jwt v0.1.8 // indirect github.com/klauspost/compress v1.15.15 // indirect + github.com/kljensen/snowball v0.8.0 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect @@ -111,6 +113,7 @@ require ( github.com/sashabaranov/go-openai v1.9.1 // indirect github.com/shirou/gopsutil/v3 v3.23.5 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect github.com/syndtr/goleveldb v1.0.0 // indirect github.com/tidwall/btree v1.6.0 // indirect github.com/tidwall/buntdb v1.3.0 // indirect diff --git a/go.sum b/go.sum index 2dd929fa..5db5a20b 100644 --- a/go.sum +++ b/go.sum @@ -104,6 +104,8 @@ github.com/hbakhtiyor/strsim v0.0.0-20190107154042-4d2bbb273edf/go.mod h1:V99KdS github.com/hdm/jarm-go v0.0.7 h1:Eq0geenHrBSYuKrdVhrBdMMzOmA+CAMLzN2WrF3eL6A= github.com/hdm/jarm-go v0.0.7/go.mod h1:kinGoS0+Sdn1Rr54OtanET5E5n7AlD6T6CrJAKDjJSQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA= +github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/julienschmidt/httprouter 
v1.3.0 h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4dN7jwJOQ1U= @@ -114,6 +116,8 @@ github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0 github.com/klauspost/compress v1.15.15 h1:EF27CXIuDsYJ6mmvtBRlEuB2UVOqHG1tAXgZ7yIO+lw= github.com/klauspost/compress v1.15.15/go.mod h1:ZcK2JAFqKOpnBlxcLsJzYfrS9X1akm9fHZNnD9+Vo/4= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= +github.com/kljensen/snowball v0.8.0 h1:WU4cExxK6sNW33AiGdbn4e8RvloHrhkAssu2mVJ11kg= +github.com/kljensen/snowball v0.8.0/go.mod h1:OGo5gFWjaeXqCu4iIrMl5OYip9XUJHGOU5eSkPjVg2A= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -247,6 +251,8 @@ github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= diff --git a/runner/options.go b/runner/options.go index d2b30e09..6ba1b955 100644 --- a/runner/options.go +++ b/runner/options.go @@ -158,6 +158,7 @@ type Options struct { OutputMatchStatusCode string OutputMatchContentLength string OutputFilterStatusCode string + 
OutputFilterErrorPage bool OutputFilterContentLength string InputRawRequest string rawRequest string @@ -334,6 +335,7 @@ func ParseOptions() *Options { flagSet.CreateGroup("filters", "Filters", flagSet.StringVarP(&options.OutputFilterStatusCode, "filter-code", "fc", "", "filter response with specified status code (-fc 403,401)"), + flagSet.BoolVarP(&options.OutputFilterErrorPage, "filter-error-page", "fep", false, "filter response with ML based error page detection"), flagSet.StringVarP(&options.OutputFilterContentLength, "filter-length", "fl", "", "filter response with specified content length (-fl 23,33)"), flagSet.StringVarP(&options.OutputFilterLinesCount, "filter-line-count", "flc", "", "filter response body with specified line count (-flc 423,532)"), flagSet.StringVarP(&options.OutputFilterWordsCount, "filter-word-count", "fwc", "", "filter response body with specified word count (-fwc 423,532)"), diff --git a/runner/runner.go b/runner/runner.go index 686491e6..b18d3818 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -27,6 +27,7 @@ import ( asnmap "github.com/projectdiscovery/asnmap/libs" dsl "github.com/projectdiscovery/dsl" "github.com/projectdiscovery/httpx/common/customextract" + "github.com/projectdiscovery/httpx/common/errorpageclassifier" "github.com/projectdiscovery/httpx/common/hashes/jarm" "github.com/projectdiscovery/mapcidr/asn" errorutil "github.com/projectdiscovery/utils/errors" @@ -66,15 +67,16 @@ import ( // Runner is a client for running the enumeration process. 
type Runner struct { - options *Options - hp *httpx.HTTPX - wappalyzer *wappalyzer.Wappalyze - scanopts ScanOptions - hm *hybrid.HybridMap - stats clistats.StatisticsClient - ratelimiter ratelimit.Limiter - HostErrorsCache gcache.Cache[string, int] - browser *Browser + options *Options + hp *httpx.HTTPX + wappalyzer *wappalyzer.Wappalyze + scanopts ScanOptions + hm *hybrid.HybridMap + stats clistats.StatisticsClient + ratelimiter ratelimit.Limiter + HostErrorsCache gcache.Cache[string, int] + browser *Browser + errorPageClassifier *errorpageclassifier.ErrorPageClassifier } // New creates a new client for running enumeration process. @@ -308,6 +310,8 @@ func New(options *Options) (*Runner, error) { runner.HostErrorsCache = gc } + runner.errorPageClassifier = errorpageclassifier.New() + return runner, nil } @@ -735,6 +739,9 @@ func (r *Runner) RunEnumeration() { } } + if r.options.OutputFilterErrorPage && resp.KnowledgeBase["PageType"] == "error" { + continue + } if len(r.options.filterStatusCode) > 0 && slice.IntSliceContains(r.options.filterStatusCode, resp.StatusCode) { continue } @@ -1753,6 +1760,9 @@ retry: ScreenshotBytes: screenshotBytes, ScreenshotPath: screenshotPath, HeadlessBody: headlessBody, + KnowledgeBase: map[string]interface{}{ + "PageType": r.errorPageClassifier.Classify(respData), + }, } if r.options.OnResult != nil { r.options.OnResult(result) diff --git a/runner/types.go b/runner/types.go index ac05f39f..a482925c 100644 --- a/runner/types.go +++ b/runner/types.go @@ -76,6 +76,7 @@ type Result struct { ScreenshotBytes []byte `json:"screenshot_bytes,omitempty" csv:"screenshot_bytes"` StoredResponsePath string `json:"stored_response_path,omitempty" csv:"stored_response_path"` ScreenshotPath string `json:"screenshot_path,omitempty" csv:"screenshot_path"` + KnowledgeBase map[string]interface{} `json:"knowledgebase,omitempty" csv:"knowledgebase"` } // function to get dsl variables from result struct From 4cfaadca16bd6939edb21b0226dce47d04d47905 Mon Sep 
17 00:00:00 2001 From: mzack Date: Thu, 22 Jun 2023 06:52:53 +0200 Subject: [PATCH 02/11] fixing tests --- runner/runner.go | 7 ++++++- runner/types.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/runner/runner.go b/runner/runner.go index 2a302ab6..2b5b1ca2 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -709,8 +709,13 @@ func (r *Runner) RunEnumeration() { gologger.Warning().Msgf("Could not decode response: %s\n", err) continue } - dslVars, _ := dslVariables() + dslVars, err := dslVariables() + if err != nil { + gologger.Warning().Msgf("Could not retrieve dsl variables: %s\n", err) + continue + } flatMap := make(map[string]interface{}) + for _, v := range dslVars { flatMap[v] = rawMap[v] } diff --git a/runner/types.go b/runner/types.go index a482925c..1135bea2 100644 --- a/runner/types.go +++ b/runner/types.go @@ -82,7 +82,7 @@ type Result struct { // function to get dsl variables from result struct func dslVariables() ([]string, error) { fakeResult := Result{} - fieldsToIgnore := []string{"Hashes", "ResponseHeader", "Err"} + fieldsToIgnore := []string{"Hashes", "ResponseHeader", "Err", "KnowledgeBase"} if err := faker.FakeData(&fakeResult, options.WithFieldsToIgnore(fieldsToIgnore...)); err != nil { return nil, err } From bd654420566ea1a33e7044d7fc52be22cd6c46eb Mon Sep 17 00:00:00 2001 From: mzack Date: Thu, 22 Jun 2023 07:09:50 +0200 Subject: [PATCH 03/11] fixing lint issues --- go.mod | 6 +++--- go.sum | 4 ++-- runner/runner.go | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index fc82e542..557941bb 100644 --- a/go.mod +++ b/go.mod @@ -39,11 +39,13 @@ require ( github.com/bxcodec/faker/v4 v4.0.0-beta.3 github.com/go-rod/rod v0.113.3 github.com/hdm/jarm-go v0.0.7 + github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 + github.com/kljensen/snowball v0.8.0 github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 github.com/mitchellh/mapstructure v1.5.0 
github.com/projectdiscovery/asnmap v1.0.4 github.com/projectdiscovery/dsl v0.0.10 - github.com/projectdiscovery/fastdialer v0.0.31 + github.com/projectdiscovery/fastdialer v0.0.32-0.20230622050633-937580e0c57d github.com/projectdiscovery/ratelimit v0.0.8 github.com/projectdiscovery/tlsx v1.1.0 github.com/projectdiscovery/utils v0.0.38 @@ -82,11 +84,9 @@ require ( github.com/google/go-querystring v1.1.0 // indirect github.com/gorilla/css v1.0.0 // indirect github.com/hashicorp/go-version v1.6.0 // indirect - github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kataras/jwt v0.1.8 // indirect github.com/klauspost/compress v1.15.15 // indirect - github.com/kljensen/snowball v0.8.0 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect diff --git a/go.sum b/go.sum index 0fde449c..c332984a 100644 --- a/go.sum +++ b/go.sum @@ -193,8 +193,8 @@ github.com/projectdiscovery/clistats v0.0.18 h1:WLQNqLXsKvjoieDwXJO/1jlnxR0x9vdF github.com/projectdiscovery/clistats v0.0.18/go.mod h1:YUnUrMHFw+FHwUTIKr1KDUwz81x+SFjPU3xfLqXfzf0= github.com/projectdiscovery/dsl v0.0.10 h1:soBL/dgyCYC9cf3BY8YzVs0sI/dFVztOOQPaaza5TfQ= github.com/projectdiscovery/dsl v0.0.10/go.mod h1:bpJD7YUHBx2D0obqI4jdRVepBGTJZ8p+86j3WNb0QXs= -github.com/projectdiscovery/fastdialer v0.0.31 h1:eu0wTBCWjT8dXChmBtnQaAxoFpkLdvq0VroRxZoe/M8= -github.com/projectdiscovery/fastdialer v0.0.31/go.mod h1:ttLvt0xnpNQAStYYQ6ElIBHfSXHuPEiXBkLH/OLbYlc= +github.com/projectdiscovery/fastdialer v0.0.32-0.20230622050633-937580e0c57d h1:4tzZg3Rs3UIRiMutSopnTlPXPz82Au+6mNb4x0weq30= +github.com/projectdiscovery/fastdialer v0.0.32-0.20230622050633-937580e0c57d/go.mod h1:ttLvt0xnpNQAStYYQ6ElIBHfSXHuPEiXBkLH/OLbYlc= github.com/projectdiscovery/fdmax v0.0.4 h1:K9tIl5MUZrEMzjvwn/G4drsHms2aufTn1xUdeVcmhmc= 
github.com/projectdiscovery/fdmax v0.0.4/go.mod h1:oZLqbhMuJ5FmcoaalOm31B1P4Vka/CqP50nWjgtSz+I= github.com/projectdiscovery/freeport v0.0.5 h1:jnd3Oqsl4S8n0KuFkE5Hm8WGDP24ITBvmyw5pFTHS8Q= diff --git a/runner/runner.go b/runner/runner.go index 2b5b1ca2..8e79b17d 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -26,6 +26,7 @@ import ( "github.com/PuerkitoBio/goquery" asnmap "github.com/projectdiscovery/asnmap/libs" dsl "github.com/projectdiscovery/dsl" + "github.com/projectdiscovery/fastdialer/fastdialer" "github.com/projectdiscovery/httpx/common/customextract" "github.com/projectdiscovery/httpx/common/errorpageclassifier" "github.com/projectdiscovery/httpx/common/hashes/jarm" @@ -1131,7 +1132,7 @@ retry: } else { requestIP = target.CustomIP } - ctx := context.WithValue(context.Background(), "ip", requestIP) //nolint + ctx := context.WithValue(context.Background(), fastdialer.IP, requestIP) req, err = hp.NewRequestWithContext(ctx, method, URL.String()) } else { req, err = hp.NewRequest(method, URL.String()) From ae1d9511df557e7850c47636cb99091a38f5cbae Mon Sep 17 00:00:00 2001 From: mzack Date: Thu, 22 Jun 2023 07:27:16 +0200 Subject: [PATCH 04/11] memory optimization --- common/errorpageclassifier/classifier.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/errorpageclassifier/classifier.go b/common/errorpageclassifier/classifier.go index c6553b2b..ac3aa1d4 100644 --- a/common/errorpageclassifier/classifier.go +++ b/common/errorpageclassifier/classifier.go @@ -16,7 +16,7 @@ import ( var ( cleaner = regexp.MustCompile(`[^\w\s]`) - stopWords = map[string]bool{"a": true, "able": true, "about": true, "above": true, "abroad": true, "according": true, "accordingly": true, "across": true, "actually": true, "adj": true, "after": true, "afterwards": true, "again": true, "against": true, "ago": true, "ahead": true, "ain't": true, "all": true, "allow": true, "allows": true, "almost": true, "alone": true, "along": true, "alongside": true, 
"already": true, "also": true, "although": true, "always": true, "am": true, "amid": true, "amidst": true, "among": true, "amongst": true, "an": true, "and": true, "another": true, "any": true, "anybody": true, "anyhow": true, "anyone": true, "anything": true, "anyway": true, "anyways": true, "anywhere": true, "apart": true, "appear": true, "appreciate": true, "appropriate": true, "are": true, "aren't": true, "around": true, "as": true, "a's": true, "aside": true, "ask": true, "asking": true, "associated": true, "at": true, "available": true, "away": true, "awfully": true, "b": true, "back": true, "backward": true, "backwards": true, "be": true, "became": true, "because": true, "become": true, "becomes": true, "becoming": true, "been": true, "before": true, "beforehand": true, "begin": true, "behind": true, "being": true, "believe": true, "below": true, "beside": true, "besides": true, "best": true, "better": true, "between": true, "beyond": true, "both": true, "brief": true, "but": true, "by": true, "c": true, "came": true, "can": true, "cannot": true, "cant": true, "can't": true, "caption": true, "cause": true, "causes": true, "certain": true, "certainly": true, "changes": true, "clearly": true, "c'mon": true, "co": true, "co.": true, "com": true, "come": true, "comes": true, "concerning": true, "consequently": true, "consider": true, "considering": true, "contain": true, "containing": true, "contains": true, "corresponding": true, "could": true, "couldn't": true, "course": true, "c's": true, "currently": true, "d": true, "dare": true, "daren't": true, "definitely": true, "described": true, "despite": true, "did": true, "didn't": true, "different": true, "directly": true, "do": true, "does": true, "doesn't": true, "doing": true, "done": true, "don't": true, "down": true, "downwards": true, "during": true, "e": true, "each": true, "edu": true, "eg": true, "eight": true, "eighty": true, "either": true, "else": true, "elsewhere": true, "end": true, "ending": true, 
"enough": true, "entirely": true, "especially": true, "et": true, "etc": true, "even": true, "ever": true, "evermore": true, "every": true, "everybody": true, "everyone": true, "everything": true, "everywhere": true, "ex": true, "exactly": true, "example": true, "except": true, "f": true, "fairly": true, "far": true, "farther": true, "few": true, "fewer": true, "fifth": true, "first": true, "five": true, "followed": true, "following": true, "follows": true, "for": true, "forever": true, "former": true, "formerly": true, "forth": true, "forward": true, "found": true, "four": true, "from": true, "further": true, "furthermore": true, "g": true, "get": true, "gets": true, "getting": true, "given": true, "gives": true, "go": true, "goes": true, "going": true, "gone": true, "got": true, "gotten": true, "greetings": true, "h": true, "had": true, "hadn't": true, "half": true, "happens": true, "hardly": true, "has": true, "hasn't": true, "have": true, "haven't": true, "having": true, "he": true, "he'd": true, "he'll": true, "hello": true, "help": true, "hence": true, "her": true, "here": true, "hereafter": true, "hereby": true, "herein": true, "here's": true, "hereupon": true, "hers": true, "herself": true, "he's": true, "hi": true, "him": true, "himself": true, "his": true, "hither": true, "hopefully": true, "how": true, "howbeit": true, "however": true, "hundred": true, "i": true, "i'd": true, "ie": true, "if": true, "ignored": true, "i'll": true, "i'm": true, "immediate": true, "in": true, "inasmuch": true, "inc": true, "inc.": true, "indeed": true, "indicate": true, "indicated": true, "indicates": true, "inner": true, "inside": true, "insofar": true, "instead": true, "into": true, "inward": true, "is": true, "isn't": true, "it": true, "it'd": true, "it'll": true, "its": true, "it's": true, "itself": true, "i've": true, "j": true, "just": true, "k": true, "keep": true, "keeps": true, "kept": true, "know": true, "known": true, "knows": true, "l": true, "last": true, 
"lately": true, "later": true, "latter": true, "latterly": true, "least": true, "less": true, "lest": true, "let": true, "let's": true, "like": true, "liked": true, "likely": true, "likewise": true, "little": true, "look": true, "looking": true, "looks": true, "low": true, "lower": true, "ltd": true, "m": true, "made": true, "mainly": true, "make": true, "makes": true, "many": true, "may": true, "maybe": true, "mayn't": true, "me": true, "mean": true, "meantime": true, "meanwhile": true, "merely": true, "might": true, "mightn't": true, "mine": true, "minus": true, "miss": true, "more": true, "moreover": true, "most": true, "mostly": true, "mr": true, "mrs": true, "much": true, "must": true, "mustn't": true, "my": true, "myself": true, "n": true, "name": true, "namely": true, "nd": true, "near": true, "nearly": true, "necessary": true, "need": true, "needn't": true, "needs": true, "neither": true, "never": true, "neverf": true, "neverless": true, "nevertheless": true, "new": true, "next": true, "nine": true, "ninety": true, "no": true, "nobody": true, "non": true, "none": true, "nonetheless": true, "noone": true, "no-one": true, "nor": true, "normally": true, "not": true, "nothing": true, "notwithstanding": true, "novel": true, "now": true, "nowhere": true, "o": true, "obviously": true, "of": true, "off": true, "often": true, "oh": true, "ok": true, "okay": true, "old": true, "on": true, "once": true, "one": true, "ones": true, "one's": true, "only": true, "onto": true, "opposite": true, "or": true, "other": true, "others": true, "otherwise": true, "ought": true, "oughtn't": true, "our": true, "ours": true, "ourselves": true, "out": true, "outside": true, "over": true, "overall": true, "own": true, "p": true, "particular": true, "particularly": true, "past": true, "per": true, "perhaps": true, "placed": true, "please": true, "plus": true, "possible": true, "presumably": true, "probably": true, "provided": true, "provides": true, "q": true, "que": true, "quite": 
true, "qv": true, "r": true, "rather": true, "rd": true, "re": true, "really": true, "reasonably": true, "recent": true, "recently": true, "regarding": true, "regardless": true, "regards": true, "relatively": true, "respectively": true, "right": true, "round": true, "s": true, "said": true, "same": true, "saw": true, "say": true, "saying": true, "says": true, "second": true, "secondly": true, "see": true, "seeing": true, "seem": true, "seemed": true, "seeming": true, "seems": true, "seen": true, "self": true, "selves": true, "sensible": true, "sent": true, "serious": true, "seriously": true, "seven": true, "several": true, "shall": true, "shan't": true, "she": true, "she'd": true, "she'll": true, "she's": true, "should": true, "shouldn't": true, "since": true, "six": true, "so": true, "some": true, "somebody": true, "someday": true, "somehow": true, "someone": true, "something": true, "sometime": true, "sometimes": true, "somewhat": true, "somewhere": true, "soon": true, "sorry": true, "specified": true, "specify": true, "specifying": true, "still": true, "sub": true, "such": true, "sup": true, "sure": true, "t": true, "take": true, "taken": true, "taking": true, "tell": true, "tends": true, "th": true, "than": true, "thank": true, "thanks": true, "thanx": true, "that": true, "that'll": true, "thats": true, "that's": true, "that've": true, "the": true, "their": true, "theirs": true, "them": true, "themselves": true, "then": true, "thence": true, "there": true, "thereafter": true, "thereby": true, "there'd": true, "therefore": true, "therein": true, "there'll": true, "there're": true, "theres": true, "there's": true, "thereupon": true, "there've": true, "these": true, "they": true, "they'd": true, "they'll": true, "they're": true, "they've": true, "thing": true, "things": true, "think": true, "third": true, "thirty": true, "this": true, "thorough": true, "thoroughly": true, "those": true, "though": true, "three": true, "through": true, "throughout": true, "thru": 
true, "thus": true, "till": true, "to": true, "together": true, "too": true, "took": true, "toward": true, "towards": true, "tried": true, "tries": true, "truly": true, "try": true, "trying": true, "t's": true, "twice": true, "two": true, "u": true, "un": true, "under": true, "underneath": true, "undoing": true, "unfortunately": true, "unless": true, "unlike": true, "unlikely": true, "until": true, "unto": true, "up": true, "upon": true, "upwards": true, "us": true, "use": true, "used": true, "useful": true, "uses": true, "using": true, "usually": true, "v": true, "value": true, "various": true, "versus": true, "very": true, "via": true, "viz": true, "vs": true, "w": true, "want": true, "wants": true, "was": true, "wasn't": true, "way": true, "we": true, "we'd": true, "welcome": true, "well": true, "we'll": true, "went": true, "were": true, "we're": true, "weren't": true, "we've": true, "what": true, "whatever": true, "what'll": true, "what's": true, "what've": true, "when": true, "whence": true, "whenever": true, "where": true, "whereafter": true, "whereas": true, "whereby": true, "wherein": true, "where's": true, "whereupon": true, "wherever": true, "whether": true, "which": true, "whichever": true, "while": true, "whilst": true, "whither": true, "who": true, "who'd": true, "whoever": true, "whole": true, "who'll": true, "whom": true, "whomever": true, "who's": true, "whose": true, "why": true, "will": true, "willing": true, "wish": true, "with": true, "within": true, "without": true, "wonder": true, "won't": true, "would": true, "wouldn't": true, "x": true, "y": true, "yes": true, "yet": true, "you": true, "you'd": true, "you'll": true, "your": true, "you're": true, "yours": true, "yourself": true, "yourselves": true, "you've": true, "z": true, "zero": true} + stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, 
"again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": {}, "eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, 
"evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, "likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, 
"mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, "seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": 
{}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, "welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, 
"whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}} ) type Sorted struct { @@ -172,7 +172,7 @@ func countWords(document string) (wordCount map[string]int) { words := strings.Split(cleaned, " ") wordCount = make(map[string]int) for _, word := range words { - if !stopWords[word] { + if _, ok := stopWords[word]; !ok { key := stem(strings.ToLower(word)) wordCount[key]++ } From e991ff19089ecc9749aff4108449b480fd96e353 Mon Sep 17 00:00:00 2001 From: mzack Date: Wed, 28 Jun 2023 11:08:09 +0200 Subject: [PATCH 05/11] commenting out train logic --- common/errorpageclassifier/classifier.go | 61 +++--- common/errorpageclassifier/cm.go | 206 +++++++++--------- .../errorpageclassifier.go | 165 +++++++------- 3 files changed, 214 insertions(+), 218 deletions(-) diff --git a/common/errorpageclassifier/classifier.go b/common/errorpageclassifier/classifier.go index ac3aa1d4..c14fa84a 100644 --- a/common/errorpageclassifier/classifier.go +++ b/common/errorpageclassifier/classifier.go @@ -5,7 +5,7 @@ package errorpageclassifier import ( "bytes" "encoding/gob" - "fmt" + "io" "os" "regexp" "sort" @@ -59,23 +59,22 @@ func NewClassifierFromFile(path string) (*Classifier, error) { fl, err := os.Open(path) if err != nil { - return classifier, err } defer fl.Close() - dErr := gob.NewDecoder(fl).Decode(classifier) - if dErr != nil { - return classifier, dErr - } - - return 
classifier, nil + return NewClassifierWithReader(fl) } // create and initialize the classifier from a file data func NewClassifierFromFileData(data []byte) (*Classifier, error) { + return NewClassifierWithReader(bytes.NewReader(data)) +} + +// create and initialize the classifier from a file data +func NewClassifierWithReader(reader io.Reader) (*Classifier, error) { classifier := &Classifier{} - err := gob.NewDecoder(bytes.NewReader(data)).Decode(classifier) + err := gob.NewDecoder(reader).Decode(classifier) if err != nil { return classifier, err } @@ -84,31 +83,31 @@ func NewClassifierFromFileData(data []byte) (*Classifier, error) { } // save the classifier to a file -func (c *Classifier) SaveClassifierToFile(path string) error { - fl, err := os.Create(path) - if err != nil { - return err - } - defer fl.Close() +// func (c *Classifier) SaveClassifierToFile(path string) error { +// fl, err := os.Create(path) +// if err != nil { +// return err +// } +// defer fl.Close() - err = gob.NewEncoder(fl).Encode(&c) - if err != nil { - return err - } +// err = gob.NewEncoder(fl).Encode(&c) +// if err != nil { +// return err +// } - return nil -} +// return nil +// } // Train the classifier -func (c *Classifier) Train(category string, document string) { - for word, count := range countWords(document) { - c.Words[category][word] += count - c.CategoriesWords[category] += count - c.TotalWords += count - } - c.CategoriesDocuments[category]++ - c.TotalDocuments++ -} +// func (c *Classifier) Train(category string, document string) { +// for word, count := range countWords(document) { +// c.Words[category][word] += count +// c.CategoriesWords[category] += count +// c.TotalWords += count +// } +// c.CategoriesDocuments[category]++ +// c.TotalDocuments++ +// } // Classify a document func (c *Classifier) Classify(document string) (category string) { @@ -190,6 +189,6 @@ func stem(word string) string { if err == nil { return stemmed } - fmt.Println("Cannot stem word:", word) + // 
fmt.Println("Cannot stem word:", word) return word } diff --git a/common/errorpageclassifier/cm.go b/common/errorpageclassifier/cm.go index c66dee30..3ba6962c 100644 --- a/common/errorpageclassifier/cm.go +++ b/common/errorpageclassifier/cm.go @@ -1,105 +1,105 @@ package errorpageclassifier -import "fmt" - -type ConfusionMatrix struct { - matrix [][]int - labels []string -} - -func NewConfusionMatrix(actual, predicted []string, labels []string) *ConfusionMatrix { - n := len(labels) - matrix := make([][]int, n) - for i := range matrix { - matrix[i] = make([]int, n) - } - - labelIndices := make(map[string]int) - for i, label := range labels { - labelIndices[label] = i - } - - for i := range actual { - matrix[labelIndices[actual[i]]][labelIndices[predicted[i]]]++ - } - - return &ConfusionMatrix{ - matrix: matrix, - labels: labels, - } -} - -func (cm *ConfusionMatrix) PrintConfusionMatrix() { - fmt.Printf("%30s\n", "Confusion Matrix") - fmt.Println() - // Print header - fmt.Printf("%-15s", "") - for _, label := range cm.labels { - fmt.Printf("%-15s", label) - } - fmt.Println() - - // Print rows - for i, row := range cm.matrix { - fmt.Printf("%-15s", cm.labels[i]) - for _, value := range row { - fmt.Printf("%-15d", value) - } - fmt.Println() - } - fmt.Println() -} - -func (cm *ConfusionMatrix) PrintClassificationReport() { - fmt.Printf("%30s\n", "Classification Report") - fmt.Println() - - fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support") - - totals := map[string]float64{"true": 0, "predicted": 0, "correct": 0} - macroAvg := map[string]float64{"precision": 0, "recall": 0, "f1-score": 0} - - for i, label := range cm.labels { - truePos := cm.matrix[i][i] - falsePos, falseNeg := 0, 0 - for j := 0; j < len(cm.labels); j++ { - if i != j { - falsePos += cm.matrix[j][i] - falseNeg += cm.matrix[i][j] - } - } - - precision := float64(truePos) / float64(truePos+falsePos) - recall := float64(truePos) / float64(truePos+falseNeg) - 
f1Score := 2 * precision * recall / (precision + recall) - support := truePos + falseNeg - - fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support) - - totals["true"] += float64(support) - totals["predicted"] += float64(truePos + falsePos) - totals["correct"] += float64(truePos) - - macroAvg["precision"] += precision - macroAvg["recall"] += recall - macroAvg["f1-score"] += f1Score - } - - accuracy := totals["correct"] / totals["true"] - fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, int(totals["true"])) - - fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg", - macroAvg["precision"]/float64(len(cm.labels)), - macroAvg["recall"]/float64(len(cm.labels)), - macroAvg["f1-score"]/float64(len(cm.labels)), - int(totals["true"])) - - precisionWeightedAvg := totals["correct"] / totals["predicted"] - recallWeightedAvg := totals["correct"] / totals["true"] - f1ScoreWeightedAvg := 2 * precisionWeightedAvg * recallWeightedAvg / (precisionWeightedAvg + recallWeightedAvg) - - fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg", - precisionWeightedAvg, recallWeightedAvg, f1ScoreWeightedAvg, int(totals["true"])) - - fmt.Println() -} +// import "fmt" + +// type ConfusionMatrix struct { +// matrix [][]int +// labels []string +// } + +// func NewConfusionMatrix(actual, predicted []string, labels []string) *ConfusionMatrix { +// n := len(labels) +// matrix := make([][]int, n) +// for i := range matrix { +// matrix[i] = make([]int, n) +// } + +// labelIndices := make(map[string]int) +// for i, label := range labels { +// labelIndices[label] = i +// } + +// for i := range actual { +// matrix[labelIndices[actual[i]]][labelIndices[predicted[i]]]++ +// } + +// return &ConfusionMatrix{ +// matrix: matrix, +// labels: labels, +// } +// } + +// func (cm *ConfusionMatrix) PrintConfusionMatrix() { +// fmt.Printf("%30s\n", "Confusion Matrix") +// fmt.Println() +// // Print header +// fmt.Printf("%-15s", "") 
+// for _, label := range cm.labels { +// fmt.Printf("%-15s", label) +// } +// fmt.Println() + +// // Print rows +// for i, row := range cm.matrix { +// fmt.Printf("%-15s", cm.labels[i]) +// for _, value := range row { +// fmt.Printf("%-15d", value) +// } +// fmt.Println() +// } +// fmt.Println() +// } + +// func (cm *ConfusionMatrix) PrintClassificationReport() { +// fmt.Printf("%30s\n", "Classification Report") +// fmt.Println() + +// fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support") + +// totals := map[string]float64{"true": 0, "predicted": 0, "correct": 0} +// macroAvg := map[string]float64{"precision": 0, "recall": 0, "f1-score": 0} + +// for i, label := range cm.labels { +// truePos := cm.matrix[i][i] +// falsePos, falseNeg := 0, 0 +// for j := 0; j < len(cm.labels); j++ { +// if i != j { +// falsePos += cm.matrix[j][i] +// falseNeg += cm.matrix[i][j] +// } +// } + +// precision := float64(truePos) / float64(truePos+falsePos) +// recall := float64(truePos) / float64(truePos+falseNeg) +// f1Score := 2 * precision * recall / (precision + recall) +// support := truePos + falseNeg + +// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support) + +// totals["true"] += float64(support) +// totals["predicted"] += float64(truePos + falsePos) +// totals["correct"] += float64(truePos) + +// macroAvg["precision"] += precision +// macroAvg["recall"] += recall +// macroAvg["f1-score"] += f1Score +// } + +// accuracy := totals["correct"] / totals["true"] +// fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, int(totals["true"])) + +// fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg", +// macroAvg["precision"]/float64(len(cm.labels)), +// macroAvg["recall"]/float64(len(cm.labels)), +// macroAvg["f1-score"]/float64(len(cm.labels)), +// int(totals["true"])) + +// precisionWeightedAvg := totals["correct"] / totals["predicted"] +// recallWeightedAvg := 
totals["correct"] / totals["true"] +// f1ScoreWeightedAvg := 2 * precisionWeightedAvg * recallWeightedAvg / (precisionWeightedAvg + recallWeightedAvg) + +// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg", +// precisionWeightedAvg, recallWeightedAvg, f1ScoreWeightedAvg, int(totals["true"])) + +// fmt.Println() +// } diff --git a/common/errorpageclassifier/errorpageclassifier.go b/common/errorpageclassifier/errorpageclassifier.go index ece886dd..76e5c341 100644 --- a/common/errorpageclassifier/errorpageclassifier.go +++ b/common/errorpageclassifier/errorpageclassifier.go @@ -2,28 +2,25 @@ package errorpageclassifier import ( _ "embed" - "fmt" - "math/rand" - "strings" "github.com/jaytaylor/html2text" ) -const ( - modelPath = "clf.gob" - threshold = 1.1 - testPercentage = 0.2 -) +// const ( +// modelPath = "clf.gob" +// threshold = 1.1 +// testPercentage = 0.2 +// ) -var categories = []string{"error", "nonerror"} +// var categories = []string{"error", "nonerror"} type Document struct { Class string Text string } -//go:embed dataset.txt -var dataset string +// go:embed dataset.txt +// var dataset string //go:embed clf.gob var classifierData []byte @@ -53,79 +50,79 @@ func (n *ErrorPageClassifier) Classify(html string) string { return cls } -func (epc *ErrorPageClassifier) Evaluate() { - train, test := trainTestSplit() - fmt.Println("no of docs in TRAIN dataset:", len(train)) - fmt.Println("no of docs in TEST dataset:", len(test)) - - fmt.Println("Evaluating classifier on test set:") - actualTest, predictedTest := epc.testClf(test) - confusionMatrixTest := NewConfusionMatrix(actualTest, predictedTest, []string{"error", "nonerror"}) - confusionMatrixTest.PrintConfusionMatrix() - confusionMatrixTest.PrintClassificationReport() - - fmt.Println("Evaluating classifier on the first 100 docs in the train set:") - actualValidate, predictedValidate := epc.validateClf(train[0:100]) - confusionMatrixValidate := NewConfusionMatrix(actualValidate, 
predictedValidate, []string{"error", "nonerror"}) - confusionMatrixValidate.PrintConfusionMatrix() - confusionMatrixValidate.PrintClassificationReport() -} - -func (epc *ErrorPageClassifier) testClf(test []Document) ([]string, []string) { - actual := []string{} - predicted := []string{} - - for _, doc := range test { - class := epc.classifier.Classify(doc.Text) - actual = append(actual, doc.Class) - predicted = append(predicted, class) - } - return actual, predicted -} - -func (epc *ErrorPageClassifier) validateClf(validation []Document) ([]string, []string) { - actual := []string{} - predicted := []string{} - - for _, doc := range validation { - actual = append(actual, doc.Class) - sentiment := epc.classifier.Classify(doc.Text) - predicted = append(predicted, sentiment) - } - return actual, predicted -} - -func TrainAndSave() { - train, test := trainTestSplit() - clf := NewClassifier(categories, threshold) - - fmt.Println("no of docs in TRAIN dataset:", len(train)) - fmt.Println("no of docs in TEST dataset:", len(test)) - - for _, doc := range train { - clf.Train(doc.Class, doc.Text) - } - - err := clf.SaveClassifierToFile(modelPath) - if err != nil { - panic(err) - } -} - -func trainTestSplit() (train, test []Document) { - data := strings.Split(dataset, "\n") - for _, line := range data { - s := strings.Split(line, "||") - doc, sentiment := s[0], s[1] - - if rand.Float64() > testPercentage { - train = append(train, Document{sentiment, doc}) - } else { - test = append(test, Document{sentiment, doc}) - } - } - return train, test -} +// func (epc *ErrorPageClassifier) Evaluate() { +// train, test := trainTestSplit() +// fmt.Println("no of docs in TRAIN dataset:", len(train)) +// fmt.Println("no of docs in TEST dataset:", len(test)) + +// fmt.Println("Evaluating classifier on test set:") +// actualTest, predictedTest := epc.testClf(test) +// confusionMatrixTest := NewConfusionMatrix(actualTest, predictedTest, []string{"error", "nonerror"}) +// 
confusionMatrixTest.PrintConfusionMatrix() +// confusionMatrixTest.PrintClassificationReport() + +// fmt.Println("Evaluating classifier on the first 100 docs in the train set:") +// actualValidate, predictedValidate := epc.validateClf(train[0:100]) +// confusionMatrixValidate := NewConfusionMatrix(actualValidate, predictedValidate, []string{"error", "nonerror"}) +// confusionMatrixValidate.PrintConfusionMatrix() +// confusionMatrixValidate.PrintClassificationReport() +// } + +// func (epc *ErrorPageClassifier) testClf(test []Document) ([]string, []string) { +// actual := []string{} +// predicted := []string{} + +// for _, doc := range test { +// class := epc.classifier.Classify(doc.Text) +// actual = append(actual, doc.Class) +// predicted = append(predicted, class) +// } +// return actual, predicted +// } + +// func (epc *ErrorPageClassifier) validateClf(validation []Document) ([]string, []string) { +// actual := []string{} +// predicted := []string{} + +// for _, doc := range validation { +// actual = append(actual, doc.Class) +// sentiment := epc.classifier.Classify(doc.Text) +// predicted = append(predicted, sentiment) +// } +// return actual, predicted +// } + +// func TrainAndSave() { +// train, test := trainTestSplit() +// clf := NewClassifier(categories, threshold) + +// fmt.Println("no of docs in TRAIN dataset:", len(train)) +// fmt.Println("no of docs in TEST dataset:", len(test)) + +// for _, doc := range train { +// clf.Train(doc.Class, doc.Text) +// } + +// err := clf.SaveClassifierToFile(modelPath) +// if err != nil { +// panic(err) +// } +// } + +// func trainTestSplit() (train, test []Document) { +// data := strings.Split(dataset, "\n") +// for _, line := range data { +// s := strings.Split(line, "||") +// doc, sentiment := s[0], s[1] + +// if rand.Float64() > testPercentage { +// train = append(train, Document{sentiment, doc}) +// } else { +// test = append(test, Document{sentiment, doc}) +// } +// } +// return train, test +// } func 
htmlToText(html string) (string, error) { text, err := html2text.FromString(html, html2text.Options{TextOnly: true}) From bbcee49994d748110c19b12b5fd40172837e0cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 3 Jul 2023 08:18:49 +0000 Subject: [PATCH 06/11] add doc --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 7ae77290..df00b6ea 100644 --- a/README.md +++ b/README.md @@ -344,6 +344,26 @@ https://support.hackerone.com [301,302,301,200] [HackerOne] [Cloudflare,Ruby on https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.] ``` +### Error Page Classifier and Filtering +The Error Page Classifier and Filtering feature aims to add intelligence to the tool by enabling it to classify and filter out common error pages returned by web applications. It is an enhancement to the existing httpx capabilities and is geared towards reducing the noise in the results and helping users focus on what matters most. 
+ +```console +httpx -l list.txt -fep + + __ __ __ _ __ + / /_ / /_/ /_____ | |/ / + / __ \/ __/ __/ __ \| / + / / / / /_/ /_/ /_/ / | +/_/ /_/\__/\__/ .___/_/|_| + /_/ + + projectdiscovery.io + +[INF] Current httpx version v1.3.2 (latest) +https://projectdiscovery.io +https://scanme.sh +``` + ### Favicon Hash From 68736aee3760599c88c69d3614e2f4c75ce40d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 3 Jul 2023 08:38:41 +0000 Subject: [PATCH 07/11] log filtered error pages --- runner/runner.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/runner/runner.go b/runner/runner.go index 8e79b17d..425c6dd9 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -746,6 +746,7 @@ func (r *Runner) RunEnumeration() { } if r.options.OutputFilterErrorPage && resp.KnowledgeBase["PageType"] == "error" { + logFilteredErrorPage(resp.URL) continue } if len(r.options.filterStatusCode) > 0 && slice.IntSliceContains(r.options.filterStatusCode, resp.StatusCode) { @@ -925,6 +926,37 @@ func (r *Runner) RunEnumeration() { wgoutput.Wait() } +func logFilteredErrorPage(url string) { + fileName := "filtered_error_page.json" + file, err := os.OpenFile(fileName, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) + if err != nil { + gologger.Fatal().Msgf("Could not open/create output file '%s': %s\n", fileName, err) + return + } + defer file.Close() + + info := map[string]interface{}{ + "url": url, + "time_filtered": time.Now(), + } + + data, err := json.Marshal(info) + if err != nil { + fmt.Println("Failed to marshal JSON:", err) + return + } + + if _, err := file.Write(data); err != nil { + gologger.Fatal().Msgf("Failed to write to '%s': %s\n", fileName, err) + return + } + + if _, err := file.WriteString("\n"); err != nil { + gologger.Fatal().Msgf("Failed to write newline to '%s': %s\n", fileName, err) + return + } +} + func (r *Runner) GetScanOpts() ScanOptions { return r.scanopts } From 
2718a9aab712ece6dff5c520fdc23f7023a9c9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 17 Jul 2023 06:54:00 +0000 Subject: [PATCH 08/11] use mlutils --- common/errorpageclassifier/classifier.go | 194 ------------------ common/errorpageclassifier/cm.go | 105 ---------- .../errorpageclassifier.go | 110 +--------- go.mod | 18 +- go.sum | 32 +-- 5 files changed, 33 insertions(+), 426 deletions(-) delete mode 100644 common/errorpageclassifier/classifier.go delete mode 100644 common/errorpageclassifier/cm.go diff --git a/common/errorpageclassifier/classifier.go b/common/errorpageclassifier/classifier.go deleted file mode 100644 index c14fa84a..00000000 --- a/common/errorpageclassifier/classifier.go +++ /dev/null @@ -1,194 +0,0 @@ -//ref: https://github.com/sausheong/gonb - -package errorpageclassifier - -import ( - "bytes" - "encoding/gob" - "io" - "os" - "regexp" - "sort" - "strings" - - "github.com/kljensen/snowball" -) - -var ( - cleaner = regexp.MustCompile(`[^\w\s]`) - stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, 
"becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": {}, "eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": 
{}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, "likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, 
"of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, "seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": 
{}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, "welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}} -) - -type Sorted struct { - Category string - Probability float64 -} - -// Classifier is what we use 
to classify documents -type Classifier struct { - Words map[string]map[string]int - TotalWords int - CategoriesDocuments map[string]int - TotalDocuments int - CategoriesWords map[string]int - Threshold float64 -} - -// create and initialize the classifier -func NewClassifier(categories []string, threshold float64) *Classifier { - classifier := &Classifier{ - Words: make(map[string]map[string]int), - TotalWords: 0, - CategoriesDocuments: make(map[string]int), - TotalDocuments: 0, - CategoriesWords: make(map[string]int), - Threshold: threshold, - } - - for _, category := range categories { - classifier.Words[category] = make(map[string]int) - classifier.CategoriesDocuments[category] = 0 - classifier.CategoriesWords[category] = 0 - } - return classifier -} - -// create and initialize the classifier from a file -func NewClassifierFromFile(path string) (*Classifier, error) { - classifier := &Classifier{} - - fl, err := os.Open(path) - if err != nil { - return classifier, err - } - defer fl.Close() - - return NewClassifierWithReader(fl) -} - -// create and initialize the classifier from a file data -func NewClassifierFromFileData(data []byte) (*Classifier, error) { - return NewClassifierWithReader(bytes.NewReader(data)) -} - -// create and initialize the classifier from a file data -func NewClassifierWithReader(reader io.Reader) (*Classifier, error) { - classifier := &Classifier{} - err := gob.NewDecoder(reader).Decode(classifier) - if err != nil { - return classifier, err - } - - return classifier, nil -} - -// save the classifier to a file -// func (c *Classifier) SaveClassifierToFile(path string) error { -// fl, err := os.Create(path) -// if err != nil { -// return err -// } -// defer fl.Close() - -// err = gob.NewEncoder(fl).Encode(&c) -// if err != nil { -// return err -// } - -// return nil -// } - -// Train the classifier -// func (c *Classifier) Train(category string, document string) { -// for word, count := range countWords(document) { -// 
c.Words[category][word] += count -// c.CategoriesWords[category] += count -// c.TotalWords += count -// } -// c.CategoriesDocuments[category]++ -// c.TotalDocuments++ -// } - -// Classify a document -func (c *Classifier) Classify(document string) (category string) { - // get all the probabilities of each category - prob := c.Probabilities(document) - - // sort the categories according to probabilities - var sp []Sorted - for c, p := range prob { - sp = append(sp, Sorted{c, p}) - } - sort.Slice(sp, func(i, j int) bool { - return sp[i].Probability > sp[j].Probability - }) - - // if the highest probability is above threshold select that - if sp[0].Probability/sp[1].Probability > c.Threshold { - category = sp[0].Category - } else { - category = "other" - } - - return -} - -// Probabilities of each category -func (c *Classifier) Probabilities(document string) (p map[string]float64) { - p = make(map[string]float64) - for category := range c.Words { - p[category] = c.pCategoryDocument(category, document) - } - return -} - -// p (document | category) -func (c *Classifier) pDocumentCategory(category string, document string) (p float64) { - p = 1.0 - for word := range countWords(document) { - p = p * c.pWordCategory(category, word) - } - return p -} - -func (c *Classifier) pWordCategory(category string, word string) float64 { - return float64(c.Words[category][stem(word)]+1) / float64(c.CategoriesWords[category]) -} - -// p (category) -func (c *Classifier) pCategory(category string) float64 { - return float64(c.CategoriesDocuments[category]) / float64(c.TotalDocuments) -} - -// p (category | document) -func (c *Classifier) pCategoryDocument(category string, document string) float64 { - return c.pDocumentCategory(category, document) * c.pCategory(category) -} - -// clean up and split words in document, then stem each word and count the occurrence -func countWords(document string) (wordCount map[string]int) { - cleaned := cleanDocument(document) - words := 
strings.Split(cleaned, " ") - wordCount = make(map[string]int) - for _, word := range words { - if _, ok := stopWords[word]; !ok { - key := stem(strings.ToLower(word)) - wordCount[key]++ - } - } - return -} - -func cleanDocument(text string) string { - return cleaner.ReplaceAllString(text, "") -} - -// stem a word using the Snowball algorithm -func stem(word string) string { - stemmed, err := snowball.Stem(word, "english", true) - if err == nil { - return stemmed - } - // fmt.Println("Cannot stem word:", word) - return word -} diff --git a/common/errorpageclassifier/cm.go b/common/errorpageclassifier/cm.go deleted file mode 100644 index 3ba6962c..00000000 --- a/common/errorpageclassifier/cm.go +++ /dev/null @@ -1,105 +0,0 @@ -package errorpageclassifier - -// import "fmt" - -// type ConfusionMatrix struct { -// matrix [][]int -// labels []string -// } - -// func NewConfusionMatrix(actual, predicted []string, labels []string) *ConfusionMatrix { -// n := len(labels) -// matrix := make([][]int, n) -// for i := range matrix { -// matrix[i] = make([]int, n) -// } - -// labelIndices := make(map[string]int) -// for i, label := range labels { -// labelIndices[label] = i -// } - -// for i := range actual { -// matrix[labelIndices[actual[i]]][labelIndices[predicted[i]]]++ -// } - -// return &ConfusionMatrix{ -// matrix: matrix, -// labels: labels, -// } -// } - -// func (cm *ConfusionMatrix) PrintConfusionMatrix() { -// fmt.Printf("%30s\n", "Confusion Matrix") -// fmt.Println() -// // Print header -// fmt.Printf("%-15s", "") -// for _, label := range cm.labels { -// fmt.Printf("%-15s", label) -// } -// fmt.Println() - -// // Print rows -// for i, row := range cm.matrix { -// fmt.Printf("%-15s", cm.labels[i]) -// for _, value := range row { -// fmt.Printf("%-15d", value) -// } -// fmt.Println() -// } -// fmt.Println() -// } - -// func (cm *ConfusionMatrix) PrintClassificationReport() { -// fmt.Printf("%30s\n", "Classification Report") -// fmt.Println() - -// 
fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support") - -// totals := map[string]float64{"true": 0, "predicted": 0, "correct": 0} -// macroAvg := map[string]float64{"precision": 0, "recall": 0, "f1-score": 0} - -// for i, label := range cm.labels { -// truePos := cm.matrix[i][i] -// falsePos, falseNeg := 0, 0 -// for j := 0; j < len(cm.labels); j++ { -// if i != j { -// falsePos += cm.matrix[j][i] -// falseNeg += cm.matrix[i][j] -// } -// } - -// precision := float64(truePos) / float64(truePos+falsePos) -// recall := float64(truePos) / float64(truePos+falseNeg) -// f1Score := 2 * precision * recall / (precision + recall) -// support := truePos + falseNeg - -// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support) - -// totals["true"] += float64(support) -// totals["predicted"] += float64(truePos + falsePos) -// totals["correct"] += float64(truePos) - -// macroAvg["precision"] += precision -// macroAvg["recall"] += recall -// macroAvg["f1-score"] += f1Score -// } - -// accuracy := totals["correct"] / totals["true"] -// fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, int(totals["true"])) - -// fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg", -// macroAvg["precision"]/float64(len(cm.labels)), -// macroAvg["recall"]/float64(len(cm.labels)), -// macroAvg["f1-score"]/float64(len(cm.labels)), -// int(totals["true"])) - -// precisionWeightedAvg := totals["correct"] / totals["predicted"] -// recallWeightedAvg := totals["correct"] / totals["true"] -// f1ScoreWeightedAvg := 2 * precisionWeightedAvg * recallWeightedAvg / (precisionWeightedAvg + recallWeightedAvg) - -// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg", -// precisionWeightedAvg, recallWeightedAvg, f1ScoreWeightedAvg, int(totals["true"])) - -// fmt.Println() -// } diff --git a/common/errorpageclassifier/errorpageclassifier.go 
b/common/errorpageclassifier/errorpageclassifier.go index 76e5c341..d916d7ce 100644 --- a/common/errorpageclassifier/errorpageclassifier.go +++ b/common/errorpageclassifier/errorpageclassifier.go @@ -4,33 +4,18 @@ import ( _ "embed" "github.com/jaytaylor/html2text" + "github.com/projectdiscovery/utils/ml/naive_bayes" ) -// const ( -// modelPath = "clf.gob" -// threshold = 1.1 -// testPercentage = 0.2 -// ) - -// var categories = []string{"error", "nonerror"} - -type Document struct { - Class string - Text string -} - -// go:embed dataset.txt -// var dataset string - //go:embed clf.gob var classifierData []byte type ErrorPageClassifier struct { - classifier *Classifier + classifier *naive_bayes.NaiveBayesClassifier } func New() *ErrorPageClassifier { - classifier, err := NewClassifierFromFileData(classifierData) + classifier, err := naive_bayes.NewClassifierFromFileData(classifierData) if err != nil { panic(err) } @@ -38,96 +23,17 @@ func New() *ErrorPageClassifier { } func (n *ErrorPageClassifier) Classify(html string) string { - text, err := htmlToText(html) - if err != nil { - panic(err) - } - + text := htmlToText(html) if text == "" { return "other" } - cls := n.classifier.Classify(text) - return cls + return n.classifier.Classify(text) } -// func (epc *ErrorPageClassifier) Evaluate() { -// train, test := trainTestSplit() -// fmt.Println("no of docs in TRAIN dataset:", len(train)) -// fmt.Println("no of docs in TEST dataset:", len(test)) - -// fmt.Println("Evaluating classifier on test set:") -// actualTest, predictedTest := epc.testClf(test) -// confusionMatrixTest := NewConfusionMatrix(actualTest, predictedTest, []string{"error", "nonerror"}) -// confusionMatrixTest.PrintConfusionMatrix() -// confusionMatrixTest.PrintClassificationReport() - -// fmt.Println("Evaluating classifier on the first 100 docs in the train set:") -// actualValidate, predictedValidate := epc.validateClf(train[0:100]) -// confusionMatrixValidate := NewConfusionMatrix(actualValidate, 
predictedValidate, []string{"error", "nonerror"}) -// confusionMatrixValidate.PrintConfusionMatrix() -// confusionMatrixValidate.PrintClassificationReport() -// } - -// func (epc *ErrorPageClassifier) testClf(test []Document) ([]string, []string) { -// actual := []string{} -// predicted := []string{} - -// for _, doc := range test { -// class := epc.classifier.Classify(doc.Text) -// actual = append(actual, doc.Class) -// predicted = append(predicted, class) -// } -// return actual, predicted -// } - -// func (epc *ErrorPageClassifier) validateClf(validation []Document) ([]string, []string) { -// actual := []string{} -// predicted := []string{} - -// for _, doc := range validation { -// actual = append(actual, doc.Class) -// sentiment := epc.classifier.Classify(doc.Text) -// predicted = append(predicted, sentiment) -// } -// return actual, predicted -// } - -// func TrainAndSave() { -// train, test := trainTestSplit() -// clf := NewClassifier(categories, threshold) - -// fmt.Println("no of docs in TRAIN dataset:", len(train)) -// fmt.Println("no of docs in TEST dataset:", len(test)) - -// for _, doc := range train { -// clf.Train(doc.Class, doc.Text) -// } - -// err := clf.SaveClassifierToFile(modelPath) -// if err != nil { -// panic(err) -// } -// } - -// func trainTestSplit() (train, test []Document) { -// data := strings.Split(dataset, "\n") -// for _, line := range data { -// s := strings.Split(line, "||") -// doc, sentiment := s[0], s[1] - -// if rand.Float64() > testPercentage { -// train = append(train, Document{sentiment, doc}) -// } else { -// test = append(test, Document{sentiment, doc}) -// } -// } -// return train, test -// } - -func htmlToText(html string) (string, error) { +func htmlToText(html string) string { text, err := html2text.FromString(html, html2text.Options{TextOnly: true}) if err != nil { - return "", err + panic(err) } - return text, nil + return text } diff --git a/go.mod b/go.mod index 43ebcc38..12b96aa9 100644 --- a/go.mod +++ b/go.mod 
@@ -17,7 +17,7 @@ require ( github.com/projectdiscovery/fdmax v0.0.4 github.com/projectdiscovery/goconfig v0.0.1 github.com/projectdiscovery/goflags v0.1.11 - github.com/projectdiscovery/gologger v1.1.10 + github.com/projectdiscovery/gologger v1.1.11 github.com/projectdiscovery/hmap v0.0.13 github.com/projectdiscovery/mapcidr v1.1.2 github.com/projectdiscovery/rawhttp v0.1.15 @@ -26,8 +26,8 @@ require ( github.com/remeh/sizedwaitgroup v1.0.0 github.com/rs/xid v1.5.0 go.etcd.io/bbolt v1.3.7 // indirect - golang.org/x/net v0.11.0 - golang.org/x/sys v0.9.0 // indirect + golang.org/x/net v0.12.0 + golang.org/x/sys v0.10.0 // indirect golang.org/x/text v0.11.0 ) @@ -40,7 +40,6 @@ require ( github.com/go-rod/rod v0.113.3 github.com/hdm/jarm-go v0.0.7 github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 - github.com/kljensen/snowball v0.8.0 github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 github.com/mitchellh/mapstructure v1.5.0 github.com/projectdiscovery/asnmap v1.0.4 @@ -48,7 +47,7 @@ require ( github.com/projectdiscovery/fastdialer v0.0.32 github.com/projectdiscovery/ratelimit v0.0.9 github.com/projectdiscovery/tlsx v1.1.0 - github.com/projectdiscovery/utils v0.0.40-0.20230627061640-8ec2b35f851c + github.com/projectdiscovery/utils v0.0.42 github.com/stretchr/testify v1.8.4 github.com/zmap/zcrypto v0.0.0-20230205235340-d51ce4775101 go.uber.org/multierr v1.11.0 @@ -88,6 +87,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/kataras/jwt v0.1.8 // indirect github.com/klauspost/compress v1.15.15 // indirect + github.com/kljensen/snowball v0.8.0 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect @@ -112,7 +112,7 @@ require ( github.com/rivo/uniseg v0.4.4 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/sashabaranov/go-openai v1.12.0 // 
indirect - github.com/shirou/gopsutil/v3 v3.23.5 // indirect + github.com/shirou/gopsutil/v3 v3.23.6 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect github.com/syndtr/goleveldb v1.0.0 // indirect @@ -140,12 +140,12 @@ require ( github.com/yuin/goldmark-emoji v1.0.1 // indirect github.com/yusufpapurcu/wmi v1.2.3 // indirect github.com/zmap/rc2 v0.0.0-20190804163417-abaa70531248 // indirect - golang.org/x/crypto v0.10.0 // indirect + golang.org/x/crypto v0.11.0 // indirect golang.org/x/mod v0.10.0 // indirect - golang.org/x/oauth2 v0.9.0 // indirect + golang.org/x/oauth2 v0.10.0 // indirect golang.org/x/tools v0.8.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.29.1 // indirect + google.golang.org/protobuf v1.31.0 // indirect gopkg.in/djherbis/times.v1 v1.3.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index e23d40fb..dcf9d785 100644 --- a/go.sum +++ b/go.sum @@ -203,8 +203,8 @@ github.com/projectdiscovery/goconfig v0.0.1 h1:36m3QjohZvemqh9bkJAakaHsm9iEZ2AcQ github.com/projectdiscovery/goconfig v0.0.1/go.mod h1:CPO25zR+mzTtyBrsygqsHse0sp/4vB/PjaHi9upXlDw= github.com/projectdiscovery/goflags v0.1.11 h1:C4UTO3SM5Vfy1J2sdhukm7wONW/tljMpUMNKue5ie00= github.com/projectdiscovery/goflags v0.1.11/go.mod h1:wC5uJonjddDcCqDNfPq+03nRessSB/LLaaIea4w47ws= -github.com/projectdiscovery/gologger v1.1.10 h1:XNRdtzLTdxiFGuK9gutoL752mykzXDoii4P2yDovqck= -github.com/projectdiscovery/gologger v1.1.10/go.mod h1:VqANHK7qcEq3i6/vV5HNWwdyv2aFPSrlaVDU4Ogrc6U= +github.com/projectdiscovery/gologger v1.1.11 h1:8vsz9oJlDT9euw6xlj7F7dZ6RWItVIqVwn4Mr6uzky8= +github.com/projectdiscovery/gologger v1.1.11/go.mod h1:UR2bgXl7zraOxYGnUwuO917hifWrwMJ0feKnVqMQkzY= github.com/projectdiscovery/hmap v0.0.13 h1:8v5j99Pz0S7V1YrTeWp7xtr1yNOffKQ/KusHZfB+mrI= github.com/projectdiscovery/hmap v0.0.13/go.mod 
h1:Ymc9xjbfhswpmI/gOx5hyR4+OvqguSq1SDJTH197gWg= github.com/projectdiscovery/mapcidr v1.1.2 h1:Mmq/nPqvVc7fjvH/kJVK0IBOny/LrJIxZ4tQsLPCrsA= @@ -222,8 +222,8 @@ github.com/projectdiscovery/retryablehttp-go v1.0.18/go.mod h1:oE3dmYWMadFWzaIfG github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA= github.com/projectdiscovery/tlsx v1.1.0 h1:6L5VKpHaoqvIHN6lH9zi7jIvph1JwYMYZOIpWBJBG6I= github.com/projectdiscovery/tlsx v1.1.0/go.mod h1:C9xTbU2t54Anmvuq+4jxevR5rzqpp6XUUtV7G9J5CTE= -github.com/projectdiscovery/utils v0.0.40-0.20230627061640-8ec2b35f851c h1:mNV/VSMi9wVpq3gcz4km2oUml9M+La20GaFoJPe3Ils= -github.com/projectdiscovery/utils v0.0.40-0.20230627061640-8ec2b35f851c/go.mod h1:rrd8dTBuKEScNMLgs1Xiu8rPCVeR0QTzmRcQ5iM3ymo= +github.com/projectdiscovery/utils v0.0.42 h1:NK506tyhI3vGH5Z6S69VTa1U/Y+VFY6vy0opg69Xy7Q= +github.com/projectdiscovery/utils v0.0.42/go.mod h1:zlRoARdARkoSa0rkoyDFPxbJ4QlqPfJnoo5pih+/FYc= github.com/projectdiscovery/wappalyzergo v0.0.102 h1:ABjZghof2U2yzGNL+q5ouWHEardLd2o53Ukgrf8CZzE= github.com/projectdiscovery/wappalyzergo v0.0.102/go.mod h1:4Z3DKhi75zIPMuA+qSDDWxZvnhL4qTLmDx4dxNMu7MA= github.com/refraction-networking/utls v1.3.2 h1:o+AkWB57mkcoW36ET7uJ002CpBWHu0KPxi6vzxvPnv8= @@ -241,8 +241,8 @@ github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7 github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sashabaranov/go-openai v1.12.0 h1:aRNHH0gtVfrpIaEolD0sWrLLRnYQNK4cH/bIAHwL8Rk= github.com/sashabaranov/go-openai v1.12.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= -github.com/shirou/gopsutil/v3 v3.23.5 h1:5SgDCeQ0KW0S4N0znjeM/eFHXXOKyv2dVNgRq/c9P6Y= -github.com/shirou/gopsutil/v3 v3.23.5/go.mod h1:Ng3Maa27Q2KARVJ0SPZF5NdrQSC3XHKP8IIWrHgMeLY= +github.com/shirou/gopsutil/v3 v3.23.6 h1:5y46WPI9QBKBbK7EEccUPNXpJpNrvPuTD0O2zHEHT08= +github.com/shirou/gopsutil/v3 v3.23.6/go.mod 
h1:j7QX50DrXYggrpN30W0Mo+I4/8U2UUIQrnrhqUeWrAU= github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= @@ -263,7 +263,6 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= @@ -350,8 +349,8 @@ golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= golang.org/x/exp v0.0.0-20230420155640-133eef4313cb h1:rhjz/8Mbfa8xROFiH+MQphmAmgqRM0bOMnytznhWEXk= golang.org/x/exp v0.0.0-20230420155640-133eef4313cb/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= golang.org/x/mod 
v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= @@ -377,12 +376,12 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU= -golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= +golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= +golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= -golang.org/x/oauth2 v0.9.0 h1:BPpt2kU7oMRq3kCHAA1tbSEshXRw1LpG2ztgDwrzuAs= -golang.org/x/oauth2 v0.9.0/go.mod h1:qYgFZaFiu6Wg24azG8bdV52QJXJGbZzIIsRCdVKzbLw= +golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= +golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -412,8 +411,9 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= 
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -447,8 +447,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.29.1 h1:7QBf+IK2gx70Ap/hDsOmam3GE0v9HicjfEdAxE62UoM= -google.golang.org/protobuf v1.29.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= From 1e7aff9dfdc7dff756e0e967c86047b0a027a51e Mon Sep 17 00:00:00 2001 From: sandeep <8293321+ehsandeep@users.noreply.github.com> Date: Mon, 17 Jul 2023 16:46:01 +0530 Subject: [PATCH 09/11] Merge branch 'dev' into pr/1245 --- resume.cfg | 2 ++ runner/banner.go | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 resume.cfg diff --git a/resume.cfg b/resume.cfg new file mode 
100644 index 00000000..b84a48b6 --- /dev/null +++ b/resume.cfg @@ -0,0 +1,2 @@ +index=21 +resume_from=www.hackerone.com diff --git a/runner/banner.go b/runner/banner.go index 3b26ef1d..a4697c03 100644 --- a/runner/banner.go +++ b/runner/banner.go @@ -1,11 +1,10 @@ package runner import ( - "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/gologger" updateutils "github.com/projectdiscovery/utils/update" ) - const banner = ` __ __ __ _ __ / /_ / /_/ /_____ | |/ / @@ -30,4 +29,4 @@ func GetUpdateCallback() func() { showBanner() updateutils.GetUpdateToolCallback("httpx", version)() } -} \ No newline at end of file +} From e90a4346fa360670928b825ce80fffcd326524db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 17 Jul 2023 12:37:13 +0000 Subject: [PATCH 10/11] update example --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fe8b4852..d88294b1 100644 --- a/README.md +++ b/README.md @@ -354,7 +354,7 @@ https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.] The Error Page Classifier and Filtering feature aims to add intelligence to the tool by enabling it to classify and filter out common error pages returned by web applications. It is an enhancement to the existing httpx capabilities and is geared towards reducing the noise in the results and helping users focus on what matters most. 
```console -httpx -l list.txt -fep +httpx -l urls.txt -path /v1/api -fep __ __ __ _ __ / /_ / /_/ /_____ | |/ / @@ -365,9 +365,8 @@ httpx -l list.txt -fep projectdiscovery.io -[INF] Current httpx version v1.3.2 (latest) -https://projectdiscovery.io -https://scanme.sh +[INF] Current httpx version v1.3.3 (latest) +https://scanme.sh/v1/api ``` ### Favicon Hash From 1c15cc35952649db4fd13c176b7da122b5cc1048 Mon Sep 17 00:00:00 2001 From: sandeep <8293321+ehsandeep@users.noreply.github.com> Date: Tue, 18 Jul 2023 01:41:26 +0530 Subject: [PATCH 11/11] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index d88294b1..5aa66357 100644 --- a/README.md +++ b/README.md @@ -351,6 +351,7 @@ https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.] ``` ### Error Page Classifier and Filtering + The Error Page Classifier and Filtering feature aims to add intelligence to the tool by enabling it to classify and filter out common error pages returned by web applications. It is an enhancement to the existing httpx capabilities and is geared towards reducing the noise in the results and helping users focus on what matters most. ```console @@ -369,6 +370,8 @@ httpx -l urls.txt -path /v1/api -fep https://scanme.sh/v1/api ``` +Filtered error pages are stored in the predefined file `filtered_error_page.json` in JSON Lines format when the `-filter-error-page` option is used. + ### Favicon Hash