网站首页 > 技术文章正文

小红书笔记拉取AI改写的抛砖引玉（小红书笔记替换后新的能收录吗）

nanyue 2025-07-23 17:09:37 技术文章 65 ℃

首先声明：本文章仅限技术研究，不可用于任何违法犯罪！

1、搜索页面关键信息查找

小红书搜索网址：
https://edith.xiaohongshu.com/api/sns/web/v1/search/notes

需要两个参数：一个是cookie，还有一个是搜索关键字

需要登录小红书后方能取得cookie，如何取得cookie，请自行baidu

调用接口的json串

{
  "keyword": "搜索关键字",
  "page": 1,
  "page_size": 20,
  "search_id": "2f0e292nup5mqrk8rzhev",
  "sort": "general",
  "note_type": 0,
  "ext_flags": [],
  "filters": [
    {"tags": ["general"], "type": "sort_type"},
    {"tags": ["不限"], "type": "filter_note_type"},
    {"tags": ["不限"], "type": "filter_note_time"},
    {"tags": ["不限"], "type": "filter_note_range"},
    {"tags": ["不限"], "type": "filter_pos_distance"}
  ],
  "geo": "",
  "image_formats": ["jpg", "webp", "avif"]
}

请求后可以取得每一条的itemId和token

2、打开每个详细页面取得相关信息

将取得的itemId和token拼成如下字符串，作为详情页的请求地址：
"https://www.xiaohongshu.com/explore/{itemId}?xsec_token={token}&xsec_source=pc_search";

cookie也必须带上哦！

请求后取得的responseBody非常大，需要再用正则表达式取出核心信息

            // Result container pre-filled with default values for every field the extractor reports.
            var result = new Dictionary<string, object>
            {
                { "noteId", "" },               // note ID
                { "userId", "" },               // ID of the currently logged-in user
                { "user_nickname", "" },        // nickname of the currently logged-in user
                { "og_userId", "" },            // note author's ID
                { "og_nickname", "" },          // note author's nickname
                { "og_title", "" },             // note title
                { "og_description", "" },       // note description
                { "og_images", new List<string>() },// note images
                { "og_videos", new List<string>() },// note videos
                { "thumbsCount", "0" },             // like count
                { "collectCount", "0" },            // favorite (collect) count
                { "commentCount", "0" },            // comment count
                { "shareCount", "0" }               // share count
            };

所有信息的读取从HTML中的__INITIAL_STATE__开始，以上部分都是乱七八糟的代码，我们不用管。

具体代码如下：

using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace XiaohongshuSpiderEXE
{
    /// <summary>
    /// Extracts note information from the HTML of a note detail page.
    /// Values come from two places: the <c>og:*</c> meta tags and the JSON object
    /// assigned to <c>window.__INITIAL_STATE__</c> inside a script tag. JSON values,
    /// when present and non-empty, take precedence over the meta-tag values.
    /// </summary>
    public class ExtractNoteData
    {
        // Matches a 24-hex-digit note id embedded as "noteId":"..." anywhere in the page.
        private static readonly Regex NoteIdRegex =
            new Regex("\"noteId\":\"([a-fA-F0-9]{24})\"");

        // Captures the body of every <script> element (Singleline so '.' spans newlines).
        private static readonly Regex ScriptTagRegex =
            new Regex(@"<script[^>]*>(.*?)</script>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Locates the assignment "window.__INITIAL_STATE__ = {", tolerating whitespace around '='.
        private static readonly Regex InitialStateRegex =
            new Regex(@"window\.__INITIAL_STATE__\s*=\s*\{");

        /// <summary>
        /// Parses a note detail page and returns the extracted fields.
        /// </summary>
        /// <param name="htmlContent">Full HTML of the note detail page. May be null or empty.</param>
        /// <returns>
        /// A dictionary that always contains the keys <c>noteId</c>, <c>userId</c>,
        /// <c>user_nickname</c>, <c>og_userId</c>, <c>og_nickname</c>, <c>og_title</c>,
        /// <c>og_description</c> (strings, default <c>""</c>), <c>og_images</c>,
        /// <c>og_videos</c> (<see cref="List{T}"/> of string) and <c>thumbsCount</c>,
        /// <c>collectCount</c>, <c>commentCount</c>, <c>shareCount</c> (strings, default <c>"0"</c>).
        /// Fields that cannot be found keep their default value; this method does not throw
        /// on malformed input, it logs to the console and returns what it has.
        /// </returns>
        public static Dictionary<string, object> ExtractNoteDataFromHtml(string htmlContent)
        {
            var result = new Dictionary<string, object>
            {
                { "noteId", "" },               // note ID
                { "userId", "" },               // ID of the currently logged-in user
                { "user_nickname", "" },        // nickname of the currently logged-in user
                { "og_userId", "" },            // note author's ID
                { "og_nickname", "" },          // note author's nickname
                { "og_title", "" },             // note title
                { "og_description", "" },       // note description
                { "og_images", new List<string>() },// note images
                { "og_videos", new List<string>() },// note videos
                { "thumbsCount", "0" },             // like count
                { "collectCount", "0" },            // favorite (collect) count
                { "commentCount", "0" },            // comment count
                { "shareCount", "0" }               // share count
            };

            // Nothing to parse: hand back the defaults instead of throwing from Regex.
            if (string.IsNullOrEmpty(htmlContent))
            {
                return result;
            }

            // Fallback note id taken straight from the page text; the JSON section
            // below overrides it when the state object can be parsed.
            Match noteIdMatch = NoteIdRegex.Match(htmlContent);
            if (noteIdMatch.Success)
            {
                result["noteId"] = noteIdMatch.Groups[1].Value;
            }

            // og:title / og:description: first matching meta tag wins.
            List<string> titles = ExtractMetaContents(htmlContent, "og:title");
            result["og_title"] = titles.Count > 0 ? titles[0] : "";

            List<string> descriptions = ExtractMetaContents(htmlContent, "og:description");
            result["og_description"] = descriptions.Count > 0 ? descriptions[0] : "";

            // og:image / og:video: collect every occurrence.
            result["og_images"] = ExtractMetaContents(htmlContent, "og:image");
            result["og_videos"] = ExtractMetaContents(htmlContent, "og:video");

            // Step 1: find the <script> element that carries __INITIAL_STATE__.
            string scriptContent = ExtractScriptContent(htmlContent);
            if (string.IsNullOrEmpty(scriptContent))
            {
                Console.WriteLine("未找到包含 __INITIAL_STATE__ 的 <script> 标签");
                return result;
            }

            // Step 2: cut the JSON object literal out of the script text.
            string jsonStr = ExtractInitialJsonFromScript(scriptContent);
            if (string.IsNullOrEmpty(jsonStr))
            {
                Console.WriteLine("未找到 __INITIAL_STATE__ 的 JSON 数据");
                return result;
            }

            // Step 3: parse the JSON and copy the fields of interest.
            try
            {
                JObject state = JObject.Parse(jsonStr);

                // Logged-in user: user.userInfo.{userId,nickname}
                var userInfo = state["user"]?["userInfo"] as JObject;
                if (userInfo != null)
                {
                    SetIfNotEmpty(result, "userId", userInfo["userId"]);
                    SetIfNotEmpty(result, "user_nickname", userInfo["nickname"]);
                }

                // Notes: note.noteDetailMap is keyed by note id; each entry wraps a "note" object.
                // If several entries exist, later ones overwrite earlier ones.
                var noteDetailMap = state["note"]?["noteDetailMap"] as JObject;
                if (noteDetailMap != null)
                {
                    foreach (JProperty entry in noteDetailMap.Properties())
                    {
                        var noteObj = (entry.Value as JObject)?["note"] as JObject;
                        if (noteObj == null)
                        {
                            continue;
                        }

                        result["noteId"] = entry.Name; // the property name is the note id
                        SetIfNotEmpty(result, "og_title", noteObj["title"]);
                        SetIfNotEmpty(result, "og_description", noteObj["desc"]);

                        // Author information
                        var author = noteObj["user"] as JObject;
                        if (author != null)
                        {
                            SetIfNotEmpty(result, "og_userId", author["userId"]);
                            SetIfNotEmpty(result, "og_nickname", author["nickname"]);
                        }

                        // Interaction counters; missing/empty values keep the "0" default.
                        var interactInfo = noteObj["interactInfo"] as JObject;
                        if (interactInfo != null)
                        {
                            SetIfNotEmpty(result, "thumbsCount", interactInfo["likedCount"]);
                            SetIfNotEmpty(result, "collectCount", interactInfo["collectedCount"]);
                            SetIfNotEmpty(result, "commentCount", interactInfo["commentCount"]);
                            SetIfNotEmpty(result, "shareCount", interactInfo["shareCount"]);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep whatever was extracted from the meta tags.
                Console.WriteLine("JSON 解析失败：" + ex.Message);
            }

            return result;
        }

        /// <summary>
        /// Returns the <c>content</c> attribute of every meta tag whose <c>name</c> or
        /// <c>property</c> attribute equals <paramref name="property"/> (case-insensitive),
        /// in document order. Only tags where <c>content</c> follows the name/property
        /// attribute are matched.
        /// </summary>
        private static List<string> ExtractMetaContents(string html, string property)
        {
            // The closing quote must be the same character as the opening one (\k<q>),
            // so an apostrophe inside a double-quoted value no longer truncates it.
            string pattern =
                @"<meta[^>]*(?:name|property)=[\x22\x27]" + Regex.Escape(property) +
                @"[\x22\x27][^>]*content=(?<q>[\x22\x27])(?<v>[^>]*?)\k<q>";

            var values = new List<string>();
            foreach (Match match in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
            {
                values.Add(match.Groups["v"].Value);
            }
            return values;
        }

        /// <summary>
        /// Writes the token's string form into <paramref name="target"/> under
        /// <paramref name="key"/>, but only when it is non-empty, so an absent, null or
        /// empty JSON field never wipes a default or a value found earlier.
        /// </summary>
        private static void SetIfNotEmpty(Dictionary<string, object> target, string key, JToken token)
        {
            string text = token?.ToString();
            if (!string.IsNullOrEmpty(text))
            {
                target[key] = text;
            }
        }

        /// <summary>
        /// Returns the body of the first script element containing
        /// <c>__INITIAL_STATE__</c>, or null when there is none.
        /// </summary>
        private static string ExtractScriptContent(string html)
        {
            foreach (Match match in ScriptTagRegex.Matches(html))
            {
                string script = match.Groups[1].Value;
                if (script.Contains("__INITIAL_STATE__"))
                {
                    return script;
                }
            }
            return null;
        }

        /// <summary>
        /// Returns the object literal assigned to <c>window.__INITIAL_STATE__</c>
        /// (from its opening brace to the matching closing brace), or null when the
        /// assignment is missing or the braces never balance.
        /// </summary>
        private static string ExtractInitialJsonFromScript(string scriptContent)
        {
            Match assignment = InitialStateRegex.Match(scriptContent);
            if (!assignment.Success)
            {
                return null;
            }

            // The pattern ends with '{', so the object starts at the last matched character.
            int startIndex = assignment.Index + assignment.Length - 1;

            int depth = 0;
            char quote = '\0';   // '\0' = outside a string, otherwise the opening quote character
            bool escaped = false;

            for (int i = startIndex; i < scriptContent.Length; i++)
            {
                char c = scriptContent[i];

                if (quote != '\0')
                {
                    // Inside a string literal: braces are plain text, only look for the end.
                    if (escaped)
                    {
                        escaped = false;
                    }
                    else if (c == '\\')
                    {
                        escaped = true;
                    }
                    else if (c == quote)
                    {
                        quote = '\0';
                    }
                    continue;
                }

                if (c == '"' || c == '\'')
                {
                    quote = c;
                }
                else if (c == '{')
                {
                    depth++;
                }
                else if (c == '}')
                {
                    depth--;
                    if (depth == 0)
                    {
                        return scriptContent.Substring(startIndex, i - startIndex + 1);
                    }
                }
            }

            // Ran off the end without closing the root object.
            return null;
        }
    }
}

3、将笔记信息交给AI改写，取得返回信息

再将取得的笔记正文转给AI，进行改写，具体操作请参考“.net core调Dify工作流，返回AI生成的结果” 和 “从零到精通：用Postman调试Dify工作流的实战指南”

4、小红书反读取

小红书有针对机器人的反爬取机制，请求频率要放慢一点，不然账号会被强制下线。

再次强调本文章仅限技术研究，不可用于任何违法犯罪行为！

保险利率又在调整了，近期有考虑配置的朋友欢迎咨询！

#Zui懂保险的IT架构师#-----求一键四连：关注、点赞、分享、收藏

上一篇： 5个被严重低估的C++标准库:告别重复造轮子，提升开发效率
下一篇： JavaScript DOM 内容操作常用方法和 XSS 注入攻击

网站首页 > 技术文章 正文