Skip to content

Commit a22c9c8

Browse files
committed
feat: scraping forum threads
1 parent 63cbd3a commit a22c9c8

File tree

10 files changed

+3847
-1
lines changed

10 files changed

+3847
-1
lines changed

app/exposer/exposer_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func FixtureTestWebServer() {
3030

3131
func testQuery(url string) (string, error) {
3232
resp, err := http.Get("http://localhost:8080" + url)
33-
logus.CheckFatal(err, "query failed")
33+
logus.CheckError(err, "query failed")
3434
defer resp.Body.Close()
3535
body, err := io.ReadAll(resp.Body)
3636
return string(body), err

app/scrappy/forum/detailed_post.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package forum
2+
3+
import (
4+
"darkbot/app/settings/logus"
5+
"fmt"
6+
"net/url"
7+
"strings"
8+
9+
"github.com/anaskhan96/soup"
10+
)
11+
12+
type DetailedPost struct {
13+
*LatestThread
14+
PostID PostID
15+
PostContent PostContent
16+
PostPermamentLink PostPermamentLink
17+
18+
requester func(MethodType, Url) (*QueryResult, error)
19+
}
20+
type PostID string
21+
type PostContent string
22+
type PostPermamentLink Url
23+
24+
type detailedPostParam func(detailedPost *DetailedPost)
25+
26+
func WithMockedRequester(
27+
requester func(MethodType, Url) (*QueryResult, error),
28+
) detailedPostParam {
29+
return func(detailedPost *DetailedPost) {
30+
detailedPost.requester = requester
31+
}
32+
}
33+
34+
func NewDetailedPost(thread *LatestThread, opts ...detailedPostParam) (*DetailedPost, error) {
35+
detailed_post := &DetailedPost{requester: NewQuery}
36+
for _, opt := range opts {
37+
opt(detailed_post)
38+
}
39+
40+
query, err := detailed_post.requester(GET, thread.ThreadLink.GetUrl())
41+
if logus.CheckError(err, "failed to query ThreadLink.GetUrl()="+string(thread.ThreadLink)) {
42+
return nil, err
43+
}
44+
45+
doc := soup.HTMLParse(query.GetContent())
46+
params, _ := url.ParseQuery(query.ResponseRawQuery)
47+
post_id := PostID(params["pid"][0])
48+
49+
forum := doc.Find("div", "id", "forum")
50+
if logus.CheckError(forum.Error, "failed to get forum object") {
51+
return nil, forum.Error
52+
}
53+
thread_header := forum.Find("td", "class", "thead")
54+
if logus.CheckError(thread_header.Error, "failed to get thread_header object") {
55+
return nil, thread_header.Error
56+
}
57+
thread_name := thread_header.FullText()
58+
thread_name = strings.ReplaceAll(thread_name, "\n", "")
59+
thread_name = strings.ReplaceAll(thread_name, "\t", "")
60+
logus.Debug("thread_name=" + thread_name)
61+
62+
post := doc.Find("table", "id", fmt.Sprintf("post_%s", string(post_id)))
63+
if logus.CheckError(post.Error, "failed to get post object") {
64+
return nil, post.Error
65+
}
66+
post_body := post.Find("div", "class", "post_body")
67+
if logus.CheckError(post_body.Error, "failed to get post_body object") {
68+
return nil, post_body.Error
69+
}
70+
post_content := post_body.FullText()
71+
72+
post_content = strings.ReplaceAll(post_content, "\t", "")
73+
for i := 0; i < 5; i++ {
74+
post_content = strings.ReplaceAll(post_content, "\n\n", "\n")
75+
}
76+
77+
detailed_post.LatestThread = thread
78+
detailed_post.PostID = post_id
79+
detailed_post.PostContent = PostContent(post_content)
80+
detailed_post.PostPermamentLink = PostPermamentLink(query.ResponseFullUrl)
81+
82+
return detailed_post, nil
83+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package forum
2+
3+
import (
4+
"darkbot/app/settings/logus"
5+
"darkbot/app/settings/utils"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
"testing"
10+
11+
"github.com/stretchr/testify/assert"
12+
)
13+
14+
func FixtureLatestThread() *LatestThread {
15+
return &LatestThread{
16+
ThreadLink: "https://discoverygc.com/forums/showthread.php?tid=200175&action=lastpost",
17+
ThreadName: "To: NNroute.../(BDM-Direk...",
18+
LastUpdated: "11-20-2023, 09:35 AM",
19+
PostAuthorLink: "https://discoverygc.com/forums/member.php?action=profile&uid=54754",
20+
PostAuthorName: "Civil Servant",
21+
}
22+
}
23+
24+
func TestGetDetailedPost(t *testing.T) {
25+
thread := FixtureLatestThread()
26+
27+
detailed_post_content_filepath := filepath.Join(utils.GetCurrrentFolder(), "test_data", "detailed_post_content.html")
28+
if _, err := os.Stat(detailed_post_content_filepath); err != nil {
29+
query, err := NewQuery("GET", "https://discoverygc.com/forums/showthread.php?tid=200175&action=lastpost")
30+
logus.CheckFatal(err, "failed to create mock data")
31+
os.WriteFile(detailed_post_content_filepath, []byte(query.GetContent()), 0644)
32+
}
33+
detailed_post_content, _ := os.ReadFile(detailed_post_content_filepath)
34+
mocked_requester := func(mt MethodType, u Url) (*QueryResult, error) {
35+
return &QueryResult{
36+
content: string(detailed_post_content),
37+
ResponseRawQuery: `tid=200175&pid=2315295`,
38+
ResponseFullUrl: `https://discoverygc.com/forums/showthread.php?tid=200175&pid=2315295#pid2315295`,
39+
}, nil
40+
}
41+
detailed_post, err := NewDetailedPost(thread, WithMockedRequester(mocked_requester))
42+
_ = detailed_post
43+
fmt.Println("err=", err)
44+
assert.Nil(t, err, "expected error to be nil")
45+
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package forum
2+
3+
import (
4+
"darkbot/app/settings/logus"
5+
"fmt"
6+
"net/url"
7+
8+
"github.com/anaskhan96/soup"
9+
)
10+
11+
type (
	// Url is a URL kept as a plain string.
	Url string

	// ThreadLink is the URL of a forum thread.
	ThreadLink Url

	// PostAuthorLink is the URL of a post author's profile.
	PostAuthorLink Url

	// ThreadShortName is the thread title as shown in the threads listing.
	ThreadShortName string

	// ThreadID is the thread identifier ("tid" query parameter of the link).
	ThreadID string

	// ForumTimestamp is a timestamp in the forum's own textual format.
	ForumTimestamp string

	// PostAuthorName is the display name of a post author.
	PostAuthorName string
)

// GetUrl converts the thread link to a plain Url.
func (link ThreadLink) GetUrl() Url {
	return Url(link)
}

// GetUrl converts the author link to a plain Url.
func (link PostAuthorLink) GetUrl() Url {
	return Url(link)
}
26+
27+
type LatestThread struct {
28+
ThreadLink ThreadLink
29+
ThreadName ThreadShortName
30+
ThreadID ThreadID
31+
LastUpdated ForumTimestamp
32+
PostAuthorLink PostAuthorLink
33+
PostAuthorName PostAuthorName
34+
}
35+
36+
type ThreadsPage struct {
37+
Threads []LatestThread
38+
requester func(MethodType, Url) (*QueryResult, error)
39+
}
40+
41+
type threadPageParam func(thread_page *ThreadsPage)
42+
43+
func WithMockedPageRequester(
44+
requester func(MethodType, Url) (*QueryResult, error),
45+
) threadPageParam {
46+
return func(thread_page *ThreadsPage) {
47+
thread_page.requester = requester
48+
}
49+
}
50+
51+
const ThreadPageURL Url = "https://discoverygc.com/forums/portal.php"
52+
53+
func GetLatestThreads(opts ...threadPageParam) (*ThreadsPage, error) {
54+
thread_page := &ThreadsPage{
55+
requester: NewQuery,
56+
}
57+
for _, opt := range opts {
58+
opt(thread_page)
59+
}
60+
61+
query, err := thread_page.requester(GET, ThreadPageURL)
62+
if logus.CheckError(err, "Failed to make query") {
63+
return nil, err
64+
}
65+
66+
content := query.GetContent()
67+
doc := soup.HTMLParse(content)
68+
forum_posts := doc.FindAll("tr", "class", "latestthreads_portal")
69+
70+
for _, forum_post := range forum_posts {
71+
thread := forum_post.Find("strong").Find("a")
72+
if logus.CheckError(thread.Error, "failed to get thread object") {
73+
return nil, thread.Error
74+
}
75+
76+
thread_link := thread.Attrs()["href"]
77+
thread_name := thread.Text()
78+
span_section := forum_post.Find("span")
79+
if logus.CheckError(span_section.Error, "failed to get span_section object") {
80+
return nil, span_section.Error
81+
}
82+
83+
forum_timestamp := span_section.Find("span").Attrs()["title"]
84+
85+
author := span_section.Find("a")
86+
if logus.CheckError(author.Error, "failed to get author object") {
87+
return nil, author.Error
88+
}
89+
author_link := author.Attrs()["href"]
90+
author_name := author.Text()
91+
92+
myUrl, _ := url.Parse(thread_link)
93+
params, _ := url.ParseQuery(myUrl.RawQuery)
94+
95+
latest_thread := LatestThread{
96+
ThreadLink: ThreadLink(thread_link),
97+
ThreadName: ThreadShortName(thread_name),
98+
LastUpdated: ForumTimestamp(forum_timestamp),
99+
PostAuthorLink: PostAuthorLink(author_link),
100+
PostAuthorName: PostAuthorName(author_name),
101+
ThreadID: ThreadID(params["tid"][0]),
102+
}
103+
thread_page.Threads = append(thread_page.Threads, latest_thread)
104+
105+
logus.Debug(fmt.Sprintf("latest_thread=%v", latest_thread))
106+
}
107+
return thread_page, nil
108+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package forum
2+
3+
import (
4+
"darkbot/app/settings/logus"
5+
"darkbot/app/settings/utils"
6+
"os"
7+
"path/filepath"
8+
"testing"
9+
10+
"github.com/stretchr/testify/assert"
11+
)
12+
13+
func TestGetPosts(t *testing.T) {
14+
thread_post_content_filepath := filepath.Join(utils.GetCurrrentFolder(), "test_data", "latest_threads.html")
15+
if _, err := os.Stat(thread_post_content_filepath); err != nil {
16+
query, err := NewQuery("GET", ThreadPageURL)
17+
logus.CheckFatal(err, "failed to create mock data")
18+
os.WriteFile(thread_post_content_filepath, []byte(query.GetContent()), 0644)
19+
}
20+
thread_post_content, _ := os.ReadFile(thread_post_content_filepath)
21+
mocked_requester := func(mt MethodType, u Url) (*QueryResult, error) {
22+
return &QueryResult{
23+
content: string(thread_post_content),
24+
ResponseRawQuery: ``,
25+
ResponseFullUrl: `https://discoverygc.com/forums/portal.php`,
26+
}, nil
27+
}
28+
29+
thread_page, err := GetLatestThreads(WithMockedPageRequester(mocked_requester))
30+
assert.Nil(t, err, "expected nil as error")
31+
assert.Greater(t, len(thread_page.Threads), 0)
32+
}

app/scrappy/forum/query.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package forum
2+
3+
import (
4+
"darkbot/app/settings/logus"
5+
"io"
6+
"net/http"
7+
8+
"golang.org/x/net/html/charset"
9+
)
10+
11+
// MethodType is an HTTP method name as passed to NewQuery.
type MethodType string

// GET is the HTTP GET method.
const GET MethodType = "GET"
16+
17+
// QueryResult is the outcome of a page query: the response body plus the URL
// the response was finally served from.
type QueryResult struct {
	content          string // response body, read through GetContent
	ResponseRawQuery string // raw query string of the response's request URL
	ResponseFullUrl  string // full request URL of the response
}

// GetContent returns the response body held by the result.
func (result *QueryResult) GetContent() string {
	body := result.content
	return body
}
26+
27+
func NewQuery(method_type MethodType, url Url) (*QueryResult, error) {
28+
client := &http.Client{}
29+
req, err := http.NewRequest(string(method_type), string(url), nil)
30+
if logus.CheckWarn(err, "Failed to create request") {
31+
return nil, err
32+
}
33+
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
34+
// req.Header.Set("User-Agent", "curl/7.81.0")
35+
req.Header.Set("ACCEPT", "*/*")
36+
req.Header.Set("CONTENT-LENGTH", "")
37+
req.Header.Set("CONTENT-TYPE", "")
38+
39+
resp, err := client.Do(req)
40+
if logus.CheckWarn(err, "Failed to make query") {
41+
return nil, err
42+
}
43+
44+
defer resp.Body.Close()
45+
46+
if resp.StatusCode >= 300 && resp.StatusCode <= 399 {
47+
logus.Debug("this request is redirecting!")
48+
redirectUrl, err := resp.Location()
49+
if logus.CheckError(err, "Error getting redirect location") {
50+
return nil, err
51+
}
52+
53+
req.URL = redirectUrl
54+
resp, err = client.Do(req)
55+
if logus.CheckError(err, "Error sending redirect request:") {
56+
return nil, err
57+
}
58+
59+
}
60+
61+
utf8Body, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
62+
if err != nil {
63+
return nil, err
64+
}
65+
bytes, err := io.ReadAll(utf8Body)
66+
67+
return &QueryResult{
68+
content: string(bytes),
69+
ResponseRawQuery: resp.Request.URL.RawQuery,
70+
ResponseFullUrl: resp.Request.URL.String(),
71+
}, err
72+
}

0 commit comments

Comments
 (0)